From 9b5e097643d29e2ea43f4e4b64d1e5c3174fc749 Mon Sep 17 00:00:00 2001 From: kondors1995 Date: Sat, 17 Aug 2024 19:34:55 +0300 Subject: [PATCH 01/30] R6.4 --- build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.sh b/build.sh index 32cb41c6bcd6..7b5c09c6c83b 100755 --- a/build.sh +++ b/build.sh @@ -15,7 +15,7 @@ export THINLTO_CACHE=~/ltocache/ DEFCONFIG="raphael_defconfig" # Kernel Details -REV="R6.3" +REV="R6.4" EDITION="BLACK" VER="$EDITION"-"$REV" From f67b4d365ee52cc44eec595b4eb1b2a62c3bedac Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Sun, 4 Aug 2024 11:29:57 +0300 Subject: [PATCH 02/30] cpufreq: sched/schedutil: Remove LATENCY_MULTIPLIER The current LATENCY_MULTIPLIER which has been around for nearly 20 years causes rate_limit_us to be always in ms range. On M1 mac mini I get 50 and 56us transition latency, but due to the 1000 multiplier we end up setting rate_limit_us to 50 and 56ms, which gets capped into 2ms and was 10ms before e13aa799c2a6 ("cpufreq: Change default transition delay to 2ms") On Intel I5 system transition latency is 20us but due to the multiplier we end up with 20ms that again is capped to 2ms. Given how good modern hardware and how modern workloads require systems to be more responsive to cater for sudden changes in workload (tasks sleeping/wakeup/migrating, uclamp causing a sudden boost or cap) and that 2ms is quarter of the time of 120Hz refresh rate system, drop the old logic in favour of providing 50% headroom. rate_limit_us = 1.5 * latency. I considered not adding any headroom which could mean that we can end up with infinite back-to-back requests. I also considered providing a constant headroom (e.g: 100us) assuming that any h/w or f/w dealing with the request shouldn't require a large headroom when transition_latency is actually high. But for both cases I wasn't sure if h/w or f/w can end up being overwhelmed dealing with the freq requests in a potentially busy system. 
So I opted for providing 50% breathing room. This is expected to impact schedutil only as the other user, dbs_governor, takes the max(2*tick, transition_delay_us) and the former was at least 2ms on 1ms TICK, which is equivalent to the max_delay_us before applying this patch. For systems with TICK of 4ms, this value would have almost always ended up with 8ms sampling rate. For systems that report 0 transition latency, we still default to returning 1ms as transition delay. This helps in eliminating a source of latency for applying requests as mentioned in [1]. For example if we have a 1ms tick, most systems will miss sending an update at tick when updating the util_avg for a task/CPU (rate_limit_us will be 2ms for most systems). [1] https://lore.kernel.org/lkml/20240724212255.mfr2ybiv2j2uqek7@airbuntu/ Signed-off-by: Qais Yousef --- drivers/cpufreq/cpufreq.c | 18 ++++-------------- include/linux/cpufreq.h | 8 -------- 2 files changed, 4 insertions(+), 22 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 1992db8bda2c..c7bb93877d29 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -537,21 +537,11 @@ unsigned int cpufreq_policy_transition_delay_us(struct cpufreq_policy *policy) return policy->transition_delay_us; latency = policy->cpuinfo.transition_latency / NSEC_PER_USEC; - if (latency) { - /* - * For platforms that can change the frequency very fast (< 10 - * us), the above formula gives a decent transition delay. But - * for platforms where transition_latency is in milliseconds, it - * ends up giving unrealistic values. - * - * Cap the default transition delay to 10 ms, which seems to be - * a reasonable amount of time after which we should reevaluate - * the frequency. 
- */ - return min(latency * LATENCY_MULTIPLIER, (unsigned int)10000); - } + if (latency) + /* Give a 50% breathing room between updates */ + return latency + (latency >> 1); - return LATENCY_MULTIPLIER; + return USEC_PER_MSEC; } EXPORT_SYMBOL_GPL(cpufreq_policy_transition_delay_us); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 84188e180a2d..40e1fdee07f4 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -493,14 +493,6 @@ static inline unsigned long cpufreq_scale(unsigned long old, u_int div, #define CPUFREQ_POLICY_POWERSAVE (1) #define CPUFREQ_POLICY_PERFORMANCE (2) -/* - * The polling frequency depends on the capability of the processor. Default - * polling frequency is 1000 times the transition latency of the processor. The - * ondemand governor will work on any processor with transition latency <= 10ms, - * using appropriate sampling rate. - */ -#define LATENCY_MULTIPLIER (1000) - struct cpufreq_governor { char name[CPUFREQ_NAME_LEN]; int (*init)(struct cpufreq_policy *policy); From 8f6e4918d3418f3f0a76a3a8940bca823addf924 Mon Sep 17 00:00:00 2001 From: kondors1995 Date: Fri, 23 Aug 2024 18:32:12 +0300 Subject: [PATCH 03/30] Revert "cpufreq: stats: replace the global lock with atomic" This reverts commit 5056373df84ba21294efd178d9092071525c7f87. 
--- drivers/cpufreq/cpufreq_stats.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 1284bc003031..f32a88d57f88 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -14,13 +14,15 @@ #include #include +static DEFINE_SPINLOCK(cpufreq_stats_lock); + struct cpufreq_stats { unsigned int total_trans; - atomic64_t last_time; + unsigned long long last_time; unsigned int max_state; unsigned int state_num; unsigned int last_index; - atomic64_t *time_in_state; + u64 *time_in_state; unsigned int *freq_table; unsigned int *trans_table; }; @@ -28,19 +30,21 @@ struct cpufreq_stats { static void cpufreq_stats_update(struct cpufreq_stats *stats) { unsigned long long cur_time = get_jiffies_64(); - unsigned long long time = cur_time; + unsigned long flags; - time = atomic64_xchg(&stats->last_time, time); - atomic64_add(cur_time - time, &stats->time_in_state[stats->last_index]); + spin_lock_irqsave(&cpufreq_stats_lock, flags); + stats->time_in_state[stats->last_index] += cur_time - stats->last_time; + stats->last_time = cur_time; + spin_unlock_irqrestore(&cpufreq_stats_lock, flags); } static void cpufreq_stats_clear_table(struct cpufreq_stats *stats) { unsigned int count = stats->max_state; - memset(stats->time_in_state, 0, count * sizeof(atomic64_t)); + memset(stats->time_in_state, 0, count * sizeof(u64)); memset(stats->trans_table, 0, count * count * sizeof(int)); - atomic64_set(&stats->last_time, get_jiffies_64()); + stats->last_time = get_jiffies_64(); stats->total_trans = 0; } @@ -59,8 +63,7 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) for (i = 0; i < stats->state_num; i++) { len += sprintf(buf + len, "%u %llu\n", stats->freq_table[i], (unsigned long long) - jiffies_64_to_clock_t(atomic64_read( - &stats->time_in_state[i]))); + jiffies_64_to_clock_t(stats->time_in_state[i])); } return len; } @@ 
-178,7 +181,7 @@ void cpufreq_stats_create_table(struct cpufreq_policy *policy) if (!stats) return; - alloc_size = count * sizeof(int) + count * sizeof(atomic64_t); + alloc_size = count * sizeof(int) + count * sizeof(u64); alloc_size += count * count * sizeof(int); @@ -199,7 +202,7 @@ void cpufreq_stats_create_table(struct cpufreq_policy *policy) stats->freq_table[i++] = pos->frequency; stats->state_num = i; - atomic64_set(&stats->last_time, get_jiffies_64()); + stats->last_time = get_jiffies_64(); stats->last_index = freq_table_get_index(stats, policy->cur); policy->stats = stats; From ff33b53a182245508593a15c96435bb729df4437 Mon Sep 17 00:00:00 2001 From: kondors1995 Date: Fri, 23 Aug 2024 18:38:58 +0300 Subject: [PATCH 04/30] Revert "cpufreq: record CPUFREQ stat for fast switch path" This reverts commit eb4660ce1f8bf5c18ecc11e0eefd1444dabb160d. # Conflicts: # drivers/cpufreq/cpufreq_stats.c --- drivers/cpufreq/cpufreq.c | 4 +--- drivers/cpufreq/cpufreq_stats.c | 11 ++++++++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index c7bb93877d29..f319f7b4bd30 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1871,10 +1871,8 @@ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, target_freq = clamp_val(target_freq, policy->min, policy->max); ret = cpufreq_driver->fast_switch(policy, target_freq); - if (ret) { + if (ret) cpufreq_times_record_transition(policy, ret); - cpufreq_stats_record_transition(policy, ret); - } return ret; } diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index f32a88d57f88..5b8677e89063 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -30,12 +30,11 @@ struct cpufreq_stats { static void cpufreq_stats_update(struct cpufreq_stats *stats) { unsigned long long cur_time = get_jiffies_64(); - unsigned long flags; - spin_lock_irqsave(&cpufreq_stats_lock, flags); + 
spin_lock(&cpufreq_stats_lock); stats->time_in_state[stats->last_index] += cur_time - stats->last_time; stats->last_time = cur_time; - spin_unlock_irqrestore(&cpufreq_stats_lock, flags); + spin_unlock(&cpufreq_stats_lock); } static void cpufreq_stats_clear_table(struct cpufreq_stats *stats) @@ -59,6 +58,9 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) ssize_t len = 0; int i; + if (policy->fast_switch_enabled) + return 0; + cpufreq_stats_update(stats); for (i = 0; i < stats->state_num; i++) { len += sprintf(buf + len, "%u %llu\n", stats->freq_table[i], @@ -82,6 +84,9 @@ static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) ssize_t len = 0; int i, j; + if (policy->fast_switch_enabled) + return 0; + len += scnprintf(buf + len, PAGE_SIZE - len, " From : To\n"); len += scnprintf(buf + len, PAGE_SIZE - len, " : "); for (i = 0; i < stats->state_num; i++) { From cbbed7e99468f3375bee586284354669efdf71d4 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 1 Feb 2019 11:45:44 +0530 Subject: [PATCH 05/30] cpufreq: stats: Declare freq-attr right after their callbacks Freq attribute for "trans_table" is defined right after its callback (without any blank line between them), but the others are defined separately later on. Keep this consistent and define all attributes right after their callbacks. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/cpufreq_stats.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 5b8677e89063..291883ce664f 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -51,6 +51,7 @@ static ssize_t show_total_trans(struct cpufreq_policy *policy, char *buf) { return sprintf(buf, "%d\n", policy->stats->total_trans); } +cpufreq_freq_attr_ro(total_trans); static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) { @@ -69,6 +70,7 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) } return len; } +cpufreq_freq_attr_ro(time_in_state); static ssize_t store_reset(struct cpufreq_policy *policy, const char *buf, size_t count) @@ -77,6 +79,7 @@ static ssize_t store_reset(struct cpufreq_policy *policy, const char *buf, cpufreq_stats_clear_table(policy->stats); return count; } +cpufreq_freq_attr_wo(reset); static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) { @@ -126,10 +129,6 @@ static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) } cpufreq_freq_attr_ro(trans_table); -cpufreq_freq_attr_ro(total_trans); -cpufreq_freq_attr_ro(time_in_state); -cpufreq_freq_attr_wo(reset); - static struct attribute *default_attrs[] = { &total_trans.attr, &time_in_state.attr, From fa24b0591778600d83eb09c94d0a006b34076520 Mon Sep 17 00:00:00 2001 From: Eva Huang Date: Fri, 29 Jul 2022 11:43:47 +0800 Subject: [PATCH 06/30] msm: gsi: remove the WARN_ON to avoid panic_on_warn issue on debug build. Refer to QC case 06009586. 
Bug: 201209987 Signed-off-by: Eva Huang Change-Id: Ifb8089a9d288932f18a26606567e48d1dcf5eaf2 --- drivers/platform/msm/gsi/gsi.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/platform/msm/gsi/gsi.c b/drivers/platform/msm/gsi/gsi.c index 81b54bd38a2a..7b339e3643a6 100644 --- a/drivers/platform/msm/gsi/gsi.c +++ b/drivers/platform/msm/gsi/gsi.c @@ -581,7 +581,6 @@ static void gsi_process_chan(struct gsi_xfer_compl_evt *evt, if (callback) { if (unlikely(atomic_read(&ch_ctx->poll_mode))) { GSIERR("Calling client callback in polling mode\n"); - WARN_ON(1); } ch_ctx->props.xfer_cb(notify); } From 53ae6a505535c23bf67ebd5aa3eae6fc6c55edeb Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 1 Feb 2019 11:45:45 +0530 Subject: [PATCH 07/30] cpufreq: stats: Fix concurrency issues while resetting stats It is possible for cpufreq_stats_clear_table() and cpufreq_stats_record_transition() to get called concurrently and they will try to update same variables simultaneously and may lead to corruption of data. Prevent that with the help of existing spinlock. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/cpufreq_stats.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 291883ce664f..62ad56262df9 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -31,20 +31,20 @@ static void cpufreq_stats_update(struct cpufreq_stats *stats) { unsigned long long cur_time = get_jiffies_64(); - spin_lock(&cpufreq_stats_lock); stats->time_in_state[stats->last_index] += cur_time - stats->last_time; stats->last_time = cur_time; - spin_unlock(&cpufreq_stats_lock); } static void cpufreq_stats_clear_table(struct cpufreq_stats *stats) { unsigned int count = stats->max_state; + spin_lock(&cpufreq_stats_lock); memset(stats->time_in_state, 0, count * sizeof(u64)); memset(stats->trans_table, 0, count * count * sizeof(int)); stats->last_time = get_jiffies_64(); stats->total_trans = 0; + spin_unlock(&cpufreq_stats_lock); } static ssize_t show_total_trans(struct cpufreq_policy *policy, char *buf) @@ -62,7 +62,10 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) if (policy->fast_switch_enabled) return 0; + spin_lock(&cpufreq_stats_lock); cpufreq_stats_update(stats); + spin_unlock(&cpufreq_stats_lock); + for (i = 0; i < stats->state_num; i++) { len += sprintf(buf + len, "%u %llu\n", stats->freq_table[i], (unsigned long long) @@ -239,9 +242,11 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy, if (unlikely(old_index == -1 || new_index == -1 || old_index == new_index)) return; + spin_lock(&cpufreq_stats_lock); cpufreq_stats_update(stats); stats->last_index = new_index; stats->trans_table[old_index * stats->max_state + new_index]++; stats->total_trans++; + spin_unlock(&cpufreq_stats_lock); } From b828add3e575e5675b1ec4b8ae6504d1578924f4 Mon Sep 17 00:00:00 2001 From: Kyle Lin Date: Tue, 9 Apr 2019 16:43:04 +0800 Subject: [PATCH 08/30] cpufreq: stats: Use lock by stat to replace global spin lock 
Stats is updated by each policy, using the lock by stat can reduce the contention. Signed-off-by: Kyle Lin Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_stats.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 62ad56262df9..9ad15a9236ab 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -14,7 +14,6 @@ #include #include -static DEFINE_SPINLOCK(cpufreq_stats_lock); struct cpufreq_stats { unsigned int total_trans; @@ -23,6 +22,7 @@ struct cpufreq_stats { unsigned int state_num; unsigned int last_index; u64 *time_in_state; + spinlock_t lock; unsigned int *freq_table; unsigned int *trans_table; }; @@ -39,12 +39,12 @@ static void cpufreq_stats_clear_table(struct cpufreq_stats *stats) { unsigned int count = stats->max_state; - spin_lock(&cpufreq_stats_lock); + spin_lock(&stats->lock); memset(stats->time_in_state, 0, count * sizeof(u64)); memset(stats->trans_table, 0, count * count * sizeof(int)); stats->last_time = get_jiffies_64(); stats->total_trans = 0; - spin_unlock(&cpufreq_stats_lock); + spin_unlock(&stats->lock); } static ssize_t show_total_trans(struct cpufreq_policy *policy, char *buf) @@ -62,9 +62,9 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) if (policy->fast_switch_enabled) return 0; - spin_lock(&cpufreq_stats_lock); + spin_lock(&stats->lock); cpufreq_stats_update(stats); - spin_unlock(&cpufreq_stats_lock); + spin_unlock(&stats->lock); for (i = 0; i < stats->state_num; i++) { len += sprintf(buf + len, "%u %llu\n", stats->freq_table[i], @@ -211,6 +211,7 @@ void cpufreq_stats_create_table(struct cpufreq_policy *policy) stats->state_num = i; stats->last_time = get_jiffies_64(); stats->last_index = freq_table_get_index(stats, policy->cur); + spin_lock_init(&stats->lock); policy->stats = stats; ret = sysfs_create_group(&policy->kobj, &stats_attr_group); @@ 
-242,11 +243,11 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy, if (unlikely(old_index == -1 || new_index == -1 || old_index == new_index)) return; - spin_lock(&cpufreq_stats_lock); + spin_lock(&stats->lock); cpufreq_stats_update(stats); stats->last_index = new_index; stats->trans_table[old_index * stats->max_state + new_index]++; stats->total_trans++; - spin_unlock(&cpufreq_stats_lock); + spin_unlock(&stats->lock); } From c0036f28dcd1b3c5c8ac6bf4153069408d4526e3 Mon Sep 17 00:00:00 2001 From: kondors1995 Date: Fri, 23 Aug 2024 18:44:53 +0300 Subject: [PATCH 09/30] Revert "cpufreq: stats: Mark few conditionals with unlikely()" This reverts commit 54a0849d7a6c52280bd28d66fb9a520ec004b561. --- drivers/cpufreq/cpufreq_stats.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 9ad15a9236ab..8f95b9fe257f 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -231,7 +231,7 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy, struct cpufreq_stats *stats = policy->stats; int old_index, new_index; - if (unlikely(!stats)) { + if (!stats) { pr_debug("%s: No stats found\n", __func__); return; } @@ -240,7 +240,7 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy, new_index = freq_table_get_index(stats, new_freq); /* We can't do stats->time_in_state[-1]= .. */ - if (unlikely(old_index == -1 || new_index == -1 || old_index == new_index)) + if (old_index == -1 || new_index == -1 || old_index == new_index) return; spin_lock(&stats->lock); From 57a2f4a76463ab8dbc2315d78f6ae11b6a4cf254 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 5 Oct 2020 13:26:01 +0530 Subject: [PATCH 10/30] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition() In order to prepare for lock-less stats update, add support to defer any updates to it until cpufreq_stats_record_transition() is called. 
The stats were updated from two places earlier: - show_time_in_state(): This can be easily deferred, all we need is to calculate the delta duration again in this routine to show the current state's time-in-state. - store_reset(): This is a bit tricky as we need to clear the stats here and avoid races with simultaneous call to cpufreq_stats_record_transition(). Fix that by deferring the reset of the stats (within the code) to the next call to cpufreq_stats_record_transition(), but since we need to keep showing the right stats until that time, we capture the reset time and account for the time since last time reset was called until the time cpufreq_stats_record_transition() update the stats. User space will continue seeing the stats correctly, everything will be 0 after the stats are reset, apart from the time-in-state of the current state, until the time a frequency switch happens. Signed-off-by: Viresh Kumar [ rjw: Minor changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_stats.c | 75 ++++++++++++++++++++++++--------- 1 file changed, 56 insertions(+), 19 deletions(-) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 8f95b9fe257f..e1566dbea59f 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -25,17 +25,22 @@ struct cpufreq_stats { spinlock_t lock; unsigned int *freq_table; unsigned int *trans_table; + + /* Deferred reset */ + unsigned int reset_pending; + unsigned long long reset_time; }; -static void cpufreq_stats_update(struct cpufreq_stats *stats) +static void cpufreq_stats_update(struct cpufreq_stats *stats, + unsigned long long time) { unsigned long long cur_time = get_jiffies_64(); - stats->time_in_state[stats->last_index] += cur_time - stats->last_time; + stats->time_in_state[stats->last_index] += cur_time - time; stats->last_time = cur_time; } -static void cpufreq_stats_clear_table(struct cpufreq_stats *stats) +static void cpufreq_stats_reset_table(struct 
cpufreq_stats *stats) { unsigned int count = stats->max_state; @@ -44,42 +49,67 @@ static void cpufreq_stats_clear_table(struct cpufreq_stats *stats) memset(stats->trans_table, 0, count * count * sizeof(int)); stats->last_time = get_jiffies_64(); stats->total_trans = 0; + + /* Adjust for the time elapsed since reset was requested */ + WRITE_ONCE(stats->reset_pending, 0); + cpufreq_stats_update(stats, READ_ONCE(stats->reset_time)); spin_unlock(&stats->lock); } static ssize_t show_total_trans(struct cpufreq_policy *policy, char *buf) { - return sprintf(buf, "%d\n", policy->stats->total_trans); + struct cpufreq_stats *stats = policy->stats; + + if (READ_ONCE(stats->reset_pending)) + return sprintf(buf, "%d\n", 0); + else + return sprintf(buf, "%d\n", stats->total_trans); } cpufreq_freq_attr_ro(total_trans); static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) { struct cpufreq_stats *stats = policy->stats; + bool pending = READ_ONCE(stats->reset_pending); + unsigned long long time; ssize_t len = 0; int i; if (policy->fast_switch_enabled) return 0; - spin_lock(&stats->lock); - cpufreq_stats_update(stats); - spin_unlock(&stats->lock); - for (i = 0; i < stats->state_num; i++) { + if (pending) { + if (i == stats->last_index) + time = get_jiffies_64() - READ_ONCE(stats->reset_time); + else + time = 0; + } else { + time = stats->time_in_state[i]; + if (i == stats->last_index) + time += get_jiffies_64() - stats->last_time; + } + len += sprintf(buf + len, "%u %llu\n", stats->freq_table[i], - (unsigned long long) - jiffies_64_to_clock_t(stats->time_in_state[i])); + jiffies_64_to_clock_t(time)); } return len; } cpufreq_freq_attr_ro(time_in_state); +/* We don't care what is written to the attribute */ static ssize_t store_reset(struct cpufreq_policy *policy, const char *buf, size_t count) { - /* We don't care what is written to the attribute. 
*/ - cpufreq_stats_clear_table(policy->stats); + struct cpufreq_stats *stats = policy->stats; + + /* + * Defer resetting of stats to cpufreq_stats_record_transition() to + * avoid races. + */ + WRITE_ONCE(stats->reset_time, get_jiffies_64()); + WRITE_ONCE(stats->reset_pending, 1); + return count; } cpufreq_freq_attr_wo(reset); @@ -87,8 +117,9 @@ cpufreq_freq_attr_wo(reset); static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) { struct cpufreq_stats *stats = policy->stats; + bool pending = READ_ONCE(stats->reset_pending); ssize_t len = 0; - int i, j; + int i, j, count; if (policy->fast_switch_enabled) return 0; @@ -116,8 +147,13 @@ static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) for (j = 0; j < stats->state_num; j++) { if (len >= PAGE_SIZE) break; - len += scnprintf(buf + len, PAGE_SIZE - len, "%9u ", - stats->trans_table[i*stats->max_state+j]); + + if (pending) + count = 0; + else + count = stats->trans_table[i * stats->max_state + j]; + + len += scnprintf(buf + len, PAGE_SIZE - len, "%9u ", count); } if (len >= PAGE_SIZE) break; @@ -231,10 +267,11 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy, struct cpufreq_stats *stats = policy->stats; int old_index, new_index; - if (!stats) { - pr_debug("%s: No stats found\n", __func__); + if (!stats) return; - } + + if (unlikely(READ_ONCE(stats->reset_pending))) + cpufreq_stats_reset_table(stats); old_index = stats->last_index; new_index = freq_table_get_index(stats, new_freq); @@ -244,7 +281,7 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy, return; spin_lock(&stats->lock); - cpufreq_stats_update(stats); + cpufreq_stats_update(stats, stats->last_time); stats->last_index = new_index; stats->trans_table[old_index * stats->max_state + new_index]++; From a4b305d240f5f4976c8bc771eb4572b056826bb8 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 5 Oct 2020 13:26:02 +0530 Subject: [PATCH 11/30] cpufreq: stats: Remove locking The locking 
isn't required anymore as stats can get updated only from one place at a time. Remove it. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_stats.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index e1566dbea59f..e4bf444a2773 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -22,7 +22,6 @@ struct cpufreq_stats { unsigned int state_num; unsigned int last_index; u64 *time_in_state; - spinlock_t lock; unsigned int *freq_table; unsigned int *trans_table; @@ -44,7 +43,6 @@ static void cpufreq_stats_reset_table(struct cpufreq_stats *stats) { unsigned int count = stats->max_state; - spin_lock(&stats->lock); memset(stats->time_in_state, 0, count * sizeof(u64)); memset(stats->trans_table, 0, count * count * sizeof(int)); stats->last_time = get_jiffies_64(); @@ -53,7 +51,6 @@ static void cpufreq_stats_reset_table(struct cpufreq_stats *stats) /* Adjust for the time elapsed since reset was requested */ WRITE_ONCE(stats->reset_pending, 0); cpufreq_stats_update(stats, READ_ONCE(stats->reset_time)); - spin_unlock(&stats->lock); } static ssize_t show_total_trans(struct cpufreq_policy *policy, char *buf) @@ -247,7 +244,6 @@ void cpufreq_stats_create_table(struct cpufreq_policy *policy) stats->state_num = i; stats->last_time = get_jiffies_64(); stats->last_index = freq_table_get_index(stats, policy->cur); - spin_lock_init(&stats->lock); policy->stats = stats; ret = sysfs_create_group(&policy->kobj, &stats_attr_group); @@ -280,11 +276,9 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy, if (old_index == -1 || new_index == -1 || old_index == new_index) return; - spin_lock(&stats->lock); cpufreq_stats_update(stats, stats->last_time); stats->last_index = new_index; stats->trans_table[old_index * stats->max_state + new_index]++; stats->total_trans++; - spin_unlock(&stats->lock); } From 
56541410c4135a5a1755d30392353165a8438b50 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 5 Oct 2020 13:26:03 +0530 Subject: [PATCH 12/30] cpufreq: stats: Mark few conditionals with unlikely() Since this will be part of the scheduler's hotpath in some cases, use unlikely() for few of the obvious conditionals. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_stats.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index e4bf444a2773..a7af8abb1386 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -263,7 +263,7 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy, struct cpufreq_stats *stats = policy->stats; int old_index, new_index; - if (!stats) + if (unlikely(!stats)) return; if (unlikely(READ_ONCE(stats->reset_pending))) @@ -273,7 +273,7 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy, new_index = freq_table_get_index(stats, new_freq); /* We can't do stats->time_in_state[-1]= .. */ - if (old_index == -1 || new_index == -1 || old_index == new_index) + if (unlikely(old_index == -1 || new_index == -1 || old_index == new_index)) return; cpufreq_stats_update(stats, stats->last_time); From 40577bcf39eb0cdfce209f7a8c94a7478d321643 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 5 Oct 2020 13:26:04 +0530 Subject: [PATCH 13/30] cpufreq: stats: Enable stats for fast-switch as well Now that all the blockers are gone for enabling stats in fast-switching case, enable it. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki [Kazuki: Port to v5.4] Signed-off-by: Kazuki Hashimoto # Conflicts: # drivers/cpufreq/cpufreq.c --- drivers/cpufreq/cpufreq.c | 6 ++++-- drivers/cpufreq/cpufreq_stats.c | 6 ------ 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index f319f7b4bd30..47ac18c107bd 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1870,9 +1870,11 @@ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, int ret; target_freq = clamp_val(target_freq, policy->min, policy->max); - ret = cpufreq_driver->fast_switch(policy, target_freq); - if (ret) + ret = cpufreq_driver->fast_switch(policy, target_freq); + if (ret) { cpufreq_times_record_transition(policy, ret); + cpufreq_stats_record_transition(policy, ret); + } return ret; } diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index a7af8abb1386..7195069be26d 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -72,9 +72,6 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) ssize_t len = 0; int i; - if (policy->fast_switch_enabled) - return 0; - for (i = 0; i < stats->state_num; i++) { if (pending) { if (i == stats->last_index) @@ -118,9 +115,6 @@ static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) ssize_t len = 0; int i, j, count; - if (policy->fast_switch_enabled) - return 0; - len += scnprintf(buf + len, PAGE_SIZE - len, " From : To\n"); len += scnprintf(buf + len, PAGE_SIZE - len, " : "); for (i = 0; i < stats->state_num; i++) { From e3f2a5cb0120a983fcbe7e84b4666c40769fbf44 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Tue, 6 Oct 2020 21:43:43 +0200 Subject: [PATCH 14/30] cpufreq: stats: Add memory barrier to store_reset() There is nothing to prevent the CPU or the compiler from reordering the writes to stats->reset_time and stats->reset_pending in store_reset(), in which case the readers of stats->reset_time may see a stale value. Moreover, on 32-bit arches the write to reset_time cannot be completed in one go, so the readers of it may see a partially updated value in that case. To prevent that from happening, add a write memory barrier between the writes to stats->reset_time and stats->reset_pending in store_reset() and corresponding read memory barrier in the readers of stats->reset_time. Fixes: 40c3bd4cfa6f ("cpufreq: stats: Defer stats update to cpufreq_stats_record_transition()") Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- drivers/cpufreq/cpufreq_stats.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 7195069be26d..a9356fe5712d 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -50,6 +50,11 @@ static void cpufreq_stats_reset_table(struct cpufreq_stats *stats) /* Adjust for the time elapsed since reset was requested */ WRITE_ONCE(stats->reset_pending, 0); + /* + * Prevent the reset_time read from being reordered before the + * reset_pending accesses in cpufreq_stats_record_transition(). + */ + smp_rmb(); cpufreq_stats_update(stats, READ_ONCE(stats->reset_time)); } @@ -74,10 +79,16 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) for (i = 0; i < stats->state_num; i++) { if (pending) { - if (i == stats->last_index) + if (i == stats->last_index) { + /* + * Prevent the reset_time read from occurring + * before the reset_pending read above. 
+ */ + smp_rmb(); time = get_jiffies_64() - READ_ONCE(stats->reset_time); - else + } else { time = 0; + } } else { time = stats->time_in_state[i]; if (i == stats->last_index) @@ -102,6 +113,11 @@ static ssize_t store_reset(struct cpufreq_policy *policy, const char *buf, * avoid races. */ WRITE_ONCE(stats->reset_time, get_jiffies_64()); + /* + * The memory barrier below is to prevent the readers of reset_time from + * seeing a stale or partially updated value. + */ + smp_wmb(); WRITE_ONCE(stats->reset_pending, 1); return count; From 81c810045d4c815ed115fcf2c36686fd596951cb Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 12 Oct 2020 10:20:07 +0530 Subject: [PATCH 15/30] cpufreq: stats: Fix string format specifier mismatch Fix following warning: drivers/cpufreq/cpufreq_stats.c:63:10: warning: %d in format string (no. 1) requires 'int' but the argument type is 'unsigned int' Fixes: 40c3bd4cfa6f ("cpufreq: stats: Defer stats update to cpufreq_stats_record_transition()") Reported-by: kernel test robot Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_stats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index a9356fe5712d..ed7a715ba11f 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -65,7 +65,7 @@ static ssize_t show_total_trans(struct cpufreq_policy *policy, char *buf) if (READ_ONCE(stats->reset_pending)) return sprintf(buf, "%d\n", 0); else - return sprintf(buf, "%d\n", stats->total_trans); + return sprintf(buf, "%u\n", stats->total_trans); } cpufreq_freq_attr_ro(total_trans); From 80ad6bacb0cbd722c02fe2b3789d62dffcd50ea5 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 17 Nov 2020 17:02:10 +0530 Subject: [PATCH 16/30] cpufreq: stats: Use local_clock() instead of jiffies local_clock() has better precision and accuracy as compared to jiffies, lets use it for time management in cpufreq stats. 
Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_stats.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index ed7a715ba11f..698cedca02d8 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -12,9 +12,9 @@ #include #include #include +#include #include - struct cpufreq_stats { unsigned int total_trans; unsigned long long last_time; @@ -33,7 +33,7 @@ struct cpufreq_stats { static void cpufreq_stats_update(struct cpufreq_stats *stats, unsigned long long time) { - unsigned long long cur_time = get_jiffies_64(); + unsigned long long cur_time = local_clock(); stats->time_in_state[stats->last_index] += cur_time - time; stats->last_time = cur_time; @@ -45,7 +45,7 @@ static void cpufreq_stats_reset_table(struct cpufreq_stats *stats) memset(stats->time_in_state, 0, count * sizeof(u64)); memset(stats->trans_table, 0, count * count * sizeof(int)); - stats->last_time = get_jiffies_64(); + stats->last_time = local_clock(); stats->total_trans = 0; /* Adjust for the time elapsed since reset was requested */ @@ -85,18 +85,18 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) * before the reset_pending read above. */ smp_rmb(); - time = get_jiffies_64() - READ_ONCE(stats->reset_time); + time = local_clock() - READ_ONCE(stats->reset_time); } else { time = 0; } } else { time = stats->time_in_state[i]; if (i == stats->last_index) - time += get_jiffies_64() - stats->last_time; + time += local_clock() - stats->last_time; } len += sprintf(buf + len, "%u %llu\n", stats->freq_table[i], - jiffies_64_to_clock_t(time)); + nsec_to_clock_t(time)); } return len; } @@ -112,7 +112,7 @@ static ssize_t store_reset(struct cpufreq_policy *policy, const char *buf, * Defer resetting of stats to cpufreq_stats_record_transition() to * avoid races. 
*/ - WRITE_ONCE(stats->reset_time, get_jiffies_64()); + WRITE_ONCE(stats->reset_time, local_clock()); /* * The memory barrier below is to prevent the readers of reset_time from * seeing a stale or partially updated value. @@ -252,7 +252,7 @@ void cpufreq_stats_create_table(struct cpufreq_policy *policy) stats->freq_table[i++] = pos->frequency; stats->state_num = i; - stats->last_time = get_jiffies_64(); + stats->last_time = local_clock(); stats->last_index = freq_table_get_index(stats, policy->cur); policy->stats = stats; From a0df1092950cc17db4e25ebad7d514e54f173745 Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Mon, 31 May 2021 15:16:07 +0800 Subject: [PATCH 17/30] cpufreq: stats: Clean up local variable in cpufreq_stats_create_table() Local variable 'count' will be initialized and 'ret' is also not required, so remove the redundant initialization and get rid of 'ret'. Signed-off-by: Shaokun Zhang Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_stats.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 698cedca02d8..bda35fe56b37 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -214,7 +214,7 @@ void cpufreq_stats_free_table(struct cpufreq_policy *policy) void cpufreq_stats_create_table(struct cpufreq_policy *policy) { - unsigned int i = 0, count = 0, ret = -ENOMEM; + unsigned int i = 0, count; struct cpufreq_stats *stats; unsigned int alloc_size; struct cpufreq_frequency_table *pos; @@ -256,8 +256,7 @@ void cpufreq_stats_create_table(struct cpufreq_policy *policy) stats->last_index = freq_table_get_index(stats, policy->cur); policy->stats = stats; - ret = sysfs_create_group(&policy->kobj, &stats_attr_group); - if (!ret) + if (!sysfs_create_group(&policy->kobj, &stats_attr_group)) return; /* We failed, release resources */ From ef859d5eea151741860ac9e87d830638b943eb9e Mon Sep 17 00:00:00 2001 From: 
Connor O'Brien Date: Fri, 15 Dec 2023 23:55:54 +0200 Subject: [PATCH 18/30] sched/cpufreq: Stop ignoring util updates Eliminate the check for SCHED_CPUFREQ_WALT flag in cpufreq_update_util(), update calling code to stop using that flag, and replace its definition with a placeholder. Test: Trace shows more frequent util updates Bug: 110604715 Change-Id: I3b74e950b984194f08ecbbf91872467a200c9d1d Signed-off-by: Connor O'Brien Conflicts: kernel/sched/core.c kernel/sched/sched.h kernel/sched/walt.c Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> # Conflicts: # kernel/sched/core.c --- include/linux/sched/cpufreq.h | 2 +- kernel/sched/core.c | 7 ++----- kernel/sched/sched.h | 2 -- kernel/sched/walt.c | 2 +- 4 files changed, 4 insertions(+), 9 deletions(-) diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h index 215e65da1be5..f794f8c31298 100644 --- a/include/linux/sched/cpufreq.h +++ b/include/linux/sched/cpufreq.h @@ -12,7 +12,7 @@ #define SCHED_CPUFREQ_DL (1U << 1) #define SCHED_CPUFREQ_IOWAIT (1U << 2) #define SCHED_CPUFREQ_INTERCLUSTER_MIG (1U << 3) -#define SCHED_CPUFREQ_WALT (1U << 4) +#define SCHED_CPUFREQ_RESERVED (1U << 4) #define SCHED_CPUFREQ_PL (1U << 5) #define SCHED_CPUFREQ_EARLY_DET (1U << 6) #define SCHED_CPUFREQ_FORCE_UPDATE (1U << 7) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bf0a561abed0..fa4811170da3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3138,9 +3138,7 @@ out: if (success && sched_predl) { raw_spin_lock_irqsave(&cpu_rq(cpu)->lock, flags); if (do_pl_notif(cpu_rq(cpu))) - cpufreq_update_util(cpu_rq(cpu), - SCHED_CPUFREQ_WALT | - SCHED_CPUFREQ_PL); + cpufreq_update_util(cpu_rq(cpu), SCHED_CPUFREQ_PL); raw_spin_unlock_irqrestore(&cpu_rq(cpu)->lock, flags); } #endif @@ -4142,9 +4140,8 @@ void scheduler_tick(void) early_notif = early_detection_notify(rq, wallclock); if (early_notif) - flag = SCHED_CPUFREQ_WALT | SCHED_CPUFREQ_EARLY_DET; + cpufreq_update_util(rq, 
SCHED_CPUFREQ_EARLY_DET); - cpufreq_update_util(rq, flag); rq_unlock(rq, &rf); perf_event_task_tick(); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 55f476b2e201..d2fa46b12a42 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2760,8 +2760,6 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) u64 clock; #ifdef CONFIG_SCHED_WALT - if (!(flags & SCHED_CPUFREQ_WALT)) - return; clock = sched_ktime_clock(); #else clock = rq_clock(rq); diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 2fedd010be1c..3577e39003f8 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -3220,7 +3220,7 @@ void walt_irq_work(struct irq_work *irq_work) cpu_online_mask); num_cpus = cpumask_weight(&cluster_online_cpus); for_each_cpu(cpu, &cluster_online_cpus) { - int flag = SCHED_CPUFREQ_WALT; + int flag = 0; rq = cpu_rq(cpu); From fe902b1eb4a5a111bac1c2c59e59dd3bd3d94f43 Mon Sep 17 00:00:00 2001 From: kondors1995 Date: Sun, 25 Aug 2024 15:00:04 +0300 Subject: [PATCH 19/30] Squashed commit of the following: MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 7004ae7dfb08c25dc5165abc447a70036b00b60e Author: kondors1995 Date: Fri Aug 16 17:06:30 2024 +0300 arm64: dts: sm8150-v2: Update gpu voltages commit 7d2152a68429cc4e44ec5e0d6b3340c7b3bd7130 Author: kondors1995 Date: Thu Aug 15 17:46:10 2024 +0300 sched:fair: fix unused warning not sure why it's there but let's keep it commit 37a33c25346a355a07b4900a864f6326740c3a61 Author: kondors1995 Date: Thu Aug 15 17:37:29 2024 +0300 arm64: dts: sm8150-v2: Fixup @2cc338b commit ab938b00393e1a8e6a11b0f54d5c24d56f68a4de Author: EmanuelCN Date: Sun Aug 11 14:38:41 2024 +0300 drivers: gpu: drm: Do not affine pm qos requests to prime core *This was accidentally re-added when I rebased techpack/display and drm. 
commit 42966bed31891213f5bdb64be9f23aae0acb2c2d Author: kondors1995 Date: Wed Aug 14 10:56:36 2024 +0300 raphael:defconfig: Disable CONFIG_FAIR_GROUP_SCHED commit 4e1c9539727fc898016524700f5cced0cda9f55d Author: EmanuelCN Date: Wed Aug 7 13:57:13 2024 +0300 cpufreq: schedutil: Store the cached ratelimits values In a merge resolution google accidentally removed this, since we have checkouted the scheduler to redbull this part was missing. commit 2dc779651497edbe71c74f4b41eab79b4b0e2368 Author: EmanuelCN Date: Tue Aug 6 20:24:40 2024 +0300 cpufreq: schedutil: Set rate-limits globally Since we are not gonna modify them per cluster anymore its redundant to keep them this way, plus its less expensive. commit 4ac25bd369c6ba572b582ea7099e42dfd7f7137f Author: Xuewen Yan Date: Wed Jul 19 21:05:27 2023 +0800 cpufreq: schedutil: Update next_freq when cpufreq_limits change When cpufreq's policy is 'single', there is a scenario that will cause sg_policy's next_freq to be unable to update. When the CPU's util is always max, the cpufreq will be max, and then if we change the policy's scaling_max_freq to be a lower freq, indeed, the sg_policy's next_freq need change to be the lower freq, however, because the cpu_is_busy, the next_freq would keep the max_freq. For example: The cpu7 is a single CPU: unisoc:/sys/devices/system/cpu/cpufreq/policy7 # while true;do done& [1] 4737 unisoc:/sys/devices/system/cpu/cpufreq/policy7 # taskset -p 80 4737 pid 4737's current affinity mask: ff pid 4737's new affinity mask: 80 unisoc:/sys/devices/system/cpu/cpufreq/policy7 # cat scaling_max_freq 2301000 unisoc:/sys/devices/system/cpu/cpufreq/policy7 # cat scaling_cur_freq 2301000 unisoc:/sys/devices/system/cpu/cpufreq/policy7 # echo 2171000 > scaling_max_freq unisoc:/sys/devices/system/cpu/cpufreq/policy7 # cat scaling_max_freq 2171000 At this time, the sg_policy's next_freq would stay at 2301000, which is wrong. To fix this, add a check for the ->need_freq_update flag. 
[ mingo: Clarified the changelog. ] Co-developed-by: Guohua Yan Signed-off-by: Xuewen Yan Signed-off-by: Guohua Yan Signed-off-by: Ingo Molnar Acked-by: "Rafael J. Wysocki" Link: https://lore.kernel.org/r/20230719130527.8074-1-xuewen.yan@unisoc.com commit e88732a5887bfaa8054408514f2af952e119a9c0 Author: kondors1995 Date: Mon Aug 12 20:37:01 2024 +0300 cpufreq: schedutil: remove walt bits commit a19bfda67be888221f9348eff56a53ac3b34756a Author: Dawei Li Date: Sun Aug 4 21:38:34 2024 +0300 sched/fair: Fix initial util_avg calculation Change se->load.weight to se_weight(se) in the calculation for the initial util_avg to avoid unnecessarily inflating the util_avg by 1024 times. The reason is that se->load.weight has the unit/scale as the scaled-up load, while cfs_rq->avg.load_avg has the unit/scale as the true task weight (as mapped directly from the task's nice/priority value). With CONFIG_32BIT, the scaled-up load is equal to the true task weight. With CONFIG_64BIT, the scaled-up load is 1024 times the true task weight. Thus, the current code may inflate the util_avg by 1024 times. The follow-up capping will not allow the util_avg value to go wild. But the calculation should have the correct logic. Signed-off-by: Dawei Li Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Reviewed-by: Vishal Chourasia Link: https://lore.kernel.org/r/20240315015916.21545-1-daweilics@gmail.com commit b147bc04acfe5b5647ef50ea9af580aaed04894a Author: EmanuelCN Date: Wed Aug 7 14:13:40 2024 +0300 cpufreq: schedutil: Give 25% headroom to prime core If we don't give any headroom to prime core it can result in big performance regression commit 010effdcd39496611bca9f284c32a5e554a4fd27 Author: EmanuelCN Date: Wed Jul 3 00:16:04 2024 +0300 sched: Introduce Per-Cluster DVFS Headroom Typically, UI-demanding tasks are handled by little and big cores, while the prime core is more power-hungry and leads to unnecessary boosts, causing power wastage. 
From my observations, boosting the little cores provides the most significant performance improvement for UI tasks, followed by a modest boost to the big cores. This results in fewer stutters across the entire system. To avoid excessive boosting, the implementation ensures that when the utilization is equal to or higher than the maximum capacity value, util is returned as-is. commit b72fefb30cfc63426415bdbf06d1bb4c87c4d32f Author: Vincent Guittot Date: Sun Jan 14 19:36:00 2024 +0100 sched/fair: Fix frequency selection for non-invariant case Linus reported a ~50% performance regression on single-threaded workloads on his AMD Ryzen system, and bisected it to: 9c0b4bb7f630 ("sched/cpufreq: Rework schedutil governor performance estimation") When frequency invariance is not enabled, get_capacity_ref_freq(policy) is supposed to return the current frequency and the performance margin applied by map_util_perf(), enabling the utilization to go above the maximum compute capacity and to select a higher frequency than the current one. After the changes in 9c0b4bb7f630, the performance margin was applied earlier in the path to take into account utilization clampings and we couldn't get a utilization higher than the maximum compute capacity, and the CPU remained 'stuck' at lower frequencies. To fix this, we must use a frequency above the current frequency to get a chance to select a higher OPP when the current one becomes fully used. Apply the same margin and return a frequency 25% higher than the current one in order to switch to the next OPP before we fully use the CPU at the current one. [ mingo: Clarified the changelog. 
] Fixes: 9c0b4bb7f630 ("sched/cpufreq: Rework schedutil governor performance estimation") Reported-by: Linus Torvalds Bisected-by: Linus Torvalds Reported-by: Wyes Karny Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Tested-by: Wyes Karny Link: https://lore.kernel.org/r/20240114183600.135316-1-vincent.guittot@linaro.org commit dcad748d1a2dcdb781fbcc514dc977d8ddc91be6 Author: Vincent Guittot Date: Wed Nov 22 14:39:04 2023 +0100 sched/cpufreq: Rework iowait boost Use the max value that has already been computed inside sugov_get_util() to cap the iowait boost and remove dependency with uclamp_rq_util_with() which is not used anymore. Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20231122133904.446032-3-vincent.guittot@linaro.org commit 31a906e3fcb3ba783d45157b722bcb3a321c1f8f Author: Vincent Guittot Date: Wed Nov 22 14:39:03 2023 +0100 sched/cpufreq: Rework schedutil governor performance estimation The current method to take into account uclamp hints when estimating the target frequency can end in a situation where the selected target frequency is finally higher than uclamp hints, whereas there are no real needs. Such cases mainly happen because we are currently mixing the traditional scheduler utilization signal with the uclamp performance hints. By adding these 2 metrics, we loose an important information when it comes to select the target frequency, and we have to make some assumptions which can't fit all cases. Rework the interface between the scheduler and schedutil governor in order to propagate all information down to the cpufreq governor. effective_cpu_util() interface changes and now returns the actual utilization of the CPU with 2 optional inputs: - The minimum performance for this CPU; typically the capacity to handle the deadline task and the interrupt pressure. But also uclamp_min request when available. 
- The maximum targeting performance for this CPU which reflects the maximum level that we would like to not exceed. By default it will be the CPU capacity but can be reduced because of some performance hints set with uclamp. The value can be lower than actual utilization and/or min performance level. A new sugov_effective_cpu_perf() interface is also available to compute the final performance level that is targeted for the CPU, after applying some cpufreq headroom and taking into account all inputs. With these 2 functions, schedutil is now able to decide when it must go above uclamp hints. It now also has a generic way to get the min performance level. The dependency between energy model and cpufreq governor and its headroom policy doesn't exist anymore. eenv_pd_max_util() asks schedutil for the targeted performance after applying the impact of the waking task. [ mingo: Refined the changelog & C comments. ] Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20231122133904.446032-2-vincent.guittot@linaro.org commit 307c28badda88662bfd2cf4afb4168506d1e8e96 Author: Lukasz Luba Date: Thu Dec 8 16:02:56 2022 +0000 cpufreq, sched/util: Optimize operations with single CPU capacity lookup The max CPU capacity is the same for all CPUs sharing frequency domain. There is a way to avoid heavy operations in a loop for each CPU by leveraging this knowledge. Thus, simplify the looping code in the sugov_next_freq_shared() and drop heavy multiplications. Instead, use simple max() to get the highest utilization from these CPUs. This is useful for platforms with many (4 or 6) little CPUs. We avoid heavy 2*PD_CPU_NUM multiplications in that loop, which is called billions of times, since it's not limited by the schedutil time delta filter in sugov_should_update_freq(). When there was no need to change frequency the code bailed out, not updating the sg_policy::last_freq_update_time. 
Then every visit after delta_ns time longer than the sg_policy::freq_update_delay_ns goes through and triggers the next frequency calculation code. Although, if the next frequency, as outcome of that, would be the same as current frequency, we won't update the sg_policy::last_freq_update_time and the story will be repeated (in a very short period, sometimes a few microseconds). The max CPU capacity must be fetched every time we are called, due to difficulties during the policy setup, where we are not able to get the normalized CPU capacity at the right time. The fetched CPU capacity value is than used in sugov_iowait_apply() to calculate the right boost. This required a few changes in the local functions and arguments. The capacity value should hopefully be fetched once when needed and then passed over CPU registers to those functions. Signed-off-by: Lukasz Luba Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20221208160256.859-2-lukasz.luba@arm.com Cc: Peter Zijlstra Cc: Patrick Bellasi Cc: Vincent Guittot Cc: Rafael J. Wysocki Cc: Viresh Kumar commit db9c6be8f578a2decb289f5e2cce1af14819690e Author: Rafael J. Wysocki Date: Thu Mar 28 11:33:21 2019 +0100 cpufreq: schedutil: Simplify iowait boosting There is not reason for the minimum iowait boost value in the schedutil cpufreq governor to depend on the available range of CPU frequencies. In fact, that dependency is generally confusing, because it causes the iowait boost to behave somewhat differently on CPUs with the same maximum frequency and different minimum frequencies, for example. For this reason, replace the min field in struct sugov_cpu with a constant and choose its values to be 1/8 of SCHED_CAPACITY_SCALE (for consistency with the intel_pstate driver's internal governor). [Note that policy->cpuinfo.max_freq will not be a constant any more after a subsequent change, so this change is depended on by it.] 
Link: https://lore.kernel.org/lkml/20190305083202.GU32494@hirez.programming.kicks-ass.net/T/#ee20bdc98b7d89f6110c0d00e5c3ee8c2ced93c3d Suggested-by: Peter Zijlstra Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Acked-by: Viresh Kumar commit 7e1f2edaa9a3738cdf7bc23eaec0feb866633e3a Author: Dietmar Eggemann Date: Tue Jun 21 10:04:10 2022 +0100 sched, drivers: Remove max param from effective_cpu_util()/sched_cpu_util() effective_cpu_util() already has a `int cpu' parameter which allows to retrieve the CPU capacity scale factor (or maximum CPU capacity) inside this function via an arch_scale_cpu_capacity(cpu). A lot of code calling effective_cpu_util() (or the shim sched_cpu_util()) needs the maximum CPU capacity, i.e. it will call arch_scale_cpu_capacity() already. But not having to pass it into effective_cpu_util() will make the EAS wake-up code easier, especially when the maximum CPU capacity reduced by the thermal pressure is passed through the EAS wake-up functions. Due to the asymmetric CPU capacity support of arm/arm64 architectures, arch_scale_cpu_capacity(int cpu) is a per-CPU variable read access via per_cpu(cpu_scale, cpu) on such a system. On all other architectures it is a a compile-time constant (SCHED_CAPACITY_SCALE). Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Vincent Guittot Tested-by: Lukasz Luba Link: https://lkml.kernel.org/r/20220621090414.433602-4-vdonnefort@google.com commit caa24c3962bc2a002034548e3c93dce95c9330c4 Author: Rafael J. Wysocki Date: Fri Jun 21 22:06:38 2024 +0300 cpufreq: schedutil: Add util to struct sg_cpu Instead of passing util and max between functions while computing the utilization and capacity, store the former in struct sg_cpu (along with the latter and bw_dl). 
This will allow the current utilization value to be compared with the one obtained previously (which is requisite for some code changes to follow this one), but also it causes the code to look slightly more consistent and cleaner. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar commit 176187b62d8637f44e8fcdee2b539dee8c30491b Author: John Galt Date: Thu Jun 27 21:52:34 2024 -0400 sched/eevdf: 1ms base slice commit c64c439cffc03bff0b5b69455dbd9018ae920a64 Author: John Galt Date: Thu Jun 27 21:49:47 2024 -0400 sched/features: enable SCHED_FEAT_PLACE_DEADLINE_INITIAL commit 190d38b6481aea95d454dcfab5801a578a86f3dc Author: Qais Yousef Date: Tue Apr 18 15:09:35 2023 +0100 sched/uclamp: Fix fits_capacity() check in feec() commit 244226035a1f9b2b6c326e55ae5188fab4f428cb upstream. As reported by Yun Hsiang [1], if a task has its uclamp_min >= 0.8 * 1024, it'll always pick the previous CPU because fits_capacity() will always return false in this case. The new util_fits_cpu() logic should handle this correctly for us beside more corner cases where similar failures could occur, like when using UCLAMP_MAX. We open code uclamp_rq_util_with() except for the clamp() part, util_fits_cpu() needs the 'raw' values to be passed to it. Also introduce uclamp_rq_{set, get}() shorthand accessors to get uclamp value for the rq. Makes the code more readable and ensures the right rules (use READ_ONCE/WRITE_ONCE) are respected transparently. 
[1] https://lists.linaro.org/pipermail/eas-dev/2020-July/001488.html Fixes: 1d42509e475c ("sched/fair: Make EAS wakeup placement consider uclamp restrictions") Reported-by: Yun Hsiang Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220804143609.515789-4-qais.yousef@arm.com (cherry picked from commit 244226035a1f9b2b6c326e55ae5188fab4f428cb) [Fix trivial conflict in kernel/sched/fair.c due to new automatic variables in master vs 5.10] Signed-off-by: Qais Yousef (Google) Signed-off-by: Greg Kroah-Hartman commit a661b80985ae13d3ad1a1be93f482192e4a361b5 Author: Youssef Esmat Date: Thu Jun 20 19:26:56 2024 +0300 sched/eevdf: Change base_slice to 3ms The default base slice of 0.75 msec is causing excessive context switches. Raise it to 3 msecs, which lowers the amount of context switches and the added overhead of the scheduler. BUG=b:308209790 TEST=Manual tested on DUT UPSTREAM-TASK=b:324602237 Change-Id: I7c7a7ecb377933a4032eb11161f5b5873e3e1e4c Signed-off-by: Youssef Esmat Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/5291506 Reviewed-by: Suleiman Souhlal Reviewed-by: Vineeth Pillai Reviewed-by: Steven Rostedt commit 7d18cd0125c2a4347268d2f72b53a9045937d98a Author: Youssef Esmat Date: Thu Jun 20 19:26:07 2024 +0300 sched/eevdf: Add ENFORCE_ELIGIBILITY and default settings Eligibility reduces scheduling latency of high nice task at the cost of runtime of low nice tasks. This hurts the chrome model where UI tasks should not be interrupted by background tasks. This adds a feature to be able to disable eligibility checks. Note that fairness is preserved in the way the deadline is calculated since the deadline is adjusted by the weight of the task and a task. Disabling lag placement is similar to eligibility since lag will adjust the CPU time based on the amount of runtime. 
BUG=b:308209790 TEST=Manual tested on DUT UPSTREAM-TASK=b:324602237 Signed-off-by: Youssef Esmat commit 6ef762984070d49371eb803bf50aeacfebe51613 Author: Aaron Lu Date: Tue Sep 12 14:58:08 2023 +0800 sched/fair: Ratelimit update to tg->load_avg When using sysbench to benchmark Postgres in a single docker instance with sysbench's nr_threads set to nr_cpu, it is observed there are times update_cfs_group() and update_load_avg() shows noticeable overhead on a 2sockets/112core/224cpu Intel Sapphire Rapids(SPR): 13.75% 13.74% [kernel.vmlinux] [k] update_cfs_group 10.63% 10.04% [kernel.vmlinux] [k] update_load_avg Annotate shows the cycles are mostly spent on accessing tg->load_avg with update_load_avg() being the write side and update_cfs_group() being the read side. tg->load_avg is per task group and when different tasks of the same taskgroup running on different CPUs frequently access tg->load_avg, it can be heavily contended. E.g. when running postgres_sysbench on a 2sockets/112cores/224cpus Intel Sappire Rapids, during a 5s window, the wakeup number is 14millions and migration number is 11millions and with each migration, the task's load will transfer from src cfs_rq to target cfs_rq and each change involves an update to tg->load_avg. Since the workload can trigger as many wakeups and migrations, the access(both read and write) to tg->load_avg can be unbound. As a result, the two mentioned functions showed noticeable overhead. With netperf/nr_client=nr_cpu/UDP_RR, the problem is worse: during a 5s window, wakeup number is 21millions and migration number is 14millions; update_cfs_group() costs ~25% and update_load_avg() costs ~16%. Reduce the overhead by limiting updates to tg->load_avg to at most once per ms. The update frequency is a tradeoff between tracking accuracy and overhead. 1ms is chosen because PELT window is roughly 1ms and it delivered good results for the tests that I've done. 
After this change, the cost of accessing tg->load_avg is greatly reduced and performance improved. Detailed test results below. ============================== postgres_sysbench on SPR: 25% base: 42382±19.8% patch: 50174±9.5% (noise) 50% base: 67626±1.3% patch: 67365±3.1% (noise) 75% base: 100216±1.2% patch: 112470±0.1% +12.2% 100% base: 93671±0.4% patch: 113563±0.2% +21.2% ============================== hackbench on ICL: group=1 base: 114912±5.2% patch: 117857±2.5% (noise) group=4 base: 359902±1.6% patch: 361685±2.7% (noise) group=8 base: 461070±0.8% patch: 491713±0.3% +6.6% group=16 base: 309032±5.0% patch: 378337±1.3% +22.4% ============================= hackbench on SPR: group=1 base: 100768±2.9% patch: 103134±2.9% (noise) group=4 base: 413830±12.5% patch: 378660±16.6% (noise) group=8 base: 436124±0.6% patch: 490787±3.2% +12.5% group=16 base: 457730±3.2% patch: 680452±1.3% +48.8% ============================ netperf/udp_rr on ICL 25% base: 114413±0.1% patch: 115111±0.0% +0.6% 50% base: 86803±0.5% patch: 86611±0.0% (noise) 75% base: 35959±5.3% patch: 49801±0.6% +38.5% 100% base: 61951±6.4% patch: 70224±0.8% +13.4% =========================== netperf/udp_rr on SPR 25% base: 104954±1.3% patch: 107312±2.8% (noise) 50% base: 55394±4.6% patch: 54940±7.4% (noise) 75% base: 13779±3.1% patch: 36105±1.1% +162% 100% base: 9703±3.7% patch: 28011±0.2% +189% ============================================== netperf/tcp_stream on ICL (all in noise range) 25% base: 43092±0.1% patch: 42891±0.5% 50% base: 19278±14.9% patch: 22369±7.2% 75% base: 16822±3.0% patch: 17086±2.3% 100% base: 18216±0.6% patch: 18078±2.9% =============================================== netperf/tcp_stream on SPR (all in noise range) 25% base: 34491±0.3% patch: 34886±0.5% 50% base: 19278±14.9% patch: 22369±7.2% 75% base: 16822±3.0% patch: 17086±2.3% 100% base: 18216±0.6% patch: 18078±2.9% Reported-by: Nitin Tekchandani Suggested-by: Vincent Guittot Signed-off-by: Aaron Lu Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Reviewed-by: Mathieu Desnoyers Reviewed-by: David Vernet Tested-by: Mathieu Desnoyers Tested-by: Swapnil Sapkal Link: https://lkml.kernel.org/r/20230912065808.2530-2-aaron.lu@intel.com commit c0c01cb55fa02d91f3a9f1fd456ecc6842990f99 Author: Chengming Zhou Date: Thu Aug 18 20:48:02 2022 +0800 sched/fair: Fix another detach on unattached task corner case commit 7dc603c9028e ("sched/fair: Fix PELT integrity for new tasks") fixed two load tracking problems for new task, including detach on unattached new task problem. There still left another detach on unattached task problem for the task which has been woken up by try_to_wake_up() and waiting for actually being woken up by sched_ttwu_pending(). try_to_wake_up(p) cpu = select_task_rq(p) if (task_cpu(p) != cpu) set_task_cpu(p, cpu) migrate_task_rq_fair() remove_entity_load_avg() --> unattached se->avg.last_update_time = 0; __set_task_cpu() ttwu_queue(p, cpu) ttwu_queue_wakelist() __ttwu_queue_wakelist() task_change_group_fair() detach_task_cfs_rq() detach_entity_cfs_rq() detach_entity_load_avg() --> detach on unattached task set_task_rq() attach_task_cfs_rq() attach_entity_cfs_rq() attach_entity_load_avg() The reason of this problem is similar, we should check in detach_entity_cfs_rq() that se->avg.last_update_time != 0, before do detach_entity_load_avg(). Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20220818124805.601-7-zhouchengming@bytedance.com commit 60cfd4b1c31915161d1c942a4a9067314fba8d79 Author: Sultan Alsawaf Date: Mon May 6 23:43:57 2024 -0700 sched/cass: Honor uclamp even when no CPUs can satisfy the requirement When all CPUs available to a uclamp'd process are thermal throttled, it is possible for them to be throttled below the uclamp minimum requirement. 
In this case, CASS only considers uclamp when it compares relative utilization and nowhere else; i.e., CASS essentially ignores the most important aspect of uclamp. Fix it so that CASS tries to honor uclamp even when no CPUs available to a uclamp'd process are capable of fully meeting the uclamp minimum. Signed-off-by: Sultan Alsawaf commit 82d9f4952ab48ec0b1dd5cd48da1c3619ceb01c9 Author: Sultan Alsawaf Date: Mon May 6 20:14:08 2024 -0700 sched/cass: Fix disproportionate load spreading when CPUs are throttled When CPUs are thermal throttled, CASS tries to spread load such that their resulting P-state is scaled relatively to their _throttled_ maximum capacity, rather than their original capacity. As a result, throttled CPUs are unfairly under-utilized, causing other CPUs to receive the extra burden and thus run at a disproportionately higher P-state relative to the throttled CPUs. This not only hurts performance, but also greatly diminishes energy efficiency since it breaks CASS's basic load balancing principle. To fix this, some convoluted logic is required in order to make CASS aware of a CPU's throttled and non-throttled capacity. The non-throttled capacity is used for the fundamental relative utilization comparison, while the throttled capacity is used in conjunction to ensure a throttled CPU isn't accidentally overloaded as a result. Signed-off-by: Sultan Alsawaf commit 6341ff1997878ad3b6b90a5bf0e20b99aa9e6d89 Author: Patrick Bellasi Date: Tue Dec 18 10:31:30 2018 +0000 ANDROID: sched/fair: EAS: Add uclamp support to find_best_target() Utilization clamping can be used to boost the utilization of small tasks or cap that of big tasks. Thus, one of its possible usages is to bias tasks placement to "promote" small tasks on higher capacity (less energy efficient) CPUs or "constraint" big tasks on small capacity (more energy efficient) CPUs. 
When the Energy Aware Scheduler (EAS) looks for the most energy efficient CPU to run a task on, it currently considers only the effective utilization estimated for a task. Fix this by adding an additional check to skip CPUs whose capacity is smaller than the task clamped utilization. Change-Id: I43fa6fa27e27c1eb5272c6a45d1a6a5b0faae1aa Signed-off-by: Patrick Bellasi Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit c0ccb5593e4c2ac72758c5a0f68e274a5698a839 Author: EmanuelCN Date: Mon Apr 8 21:44:58 2024 +0300 sched/eevdf: Use the other placement strategy Place lag is currently broken (4.19 needs more backports to make it work as intended) Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 052afe14ea97a104e3c384f56405c30fb571efe9 Author: Xuewen Yan Date: Mon Apr 22 16:22:38 2024 +0800 sched/eevdf: Prevent vlag from going out of bounds in reweight_eevdf() It was possible to have pick_eevdf() return NULL, which then causes a NULL-deref. This turned out to be due to entity_eligible() returning falsely negative because of an s64 multiplication overflow. Specifically, reweight_eevdf() computes the vlag without considering the limit placed upon vlag as update_entity_lag() does, and then the scaling multiplication (remember that weight is 20bit fixed point) can overflow. This then leads to the new vruntime being weird which then causes the above entity_eligible() to go side-ways and claim nothing is eligible. Thus limit the range of vlag accordingly. All this was quite rare, but fatal when it does happen.
Closes: https://lore.kernel.org/all/ZhuYyrh3mweP_Kd8@nz.home/ Closes: https://lore.kernel.org/all/CA+9S74ih+45M_2TPUY_mPPVDhNvyYfy1J1ftSix+KjiTVxg8nw@mail.gmail.com/ Closes: https://lore.kernel.org/lkml/202401301012.2ed95df0-oliver.sang@intel.com/ Fixes: eab03c23c2a1 ("sched/eevdf: Fix vruntime adjustment on reweight") Reported-by: Sergei Trofimovich Reported-by: Igor Raits Reported-by: Breno Leitao Reported-by: kernel test robot Reported-by: Yujie Liu Signed-off-by: Xuewen Yan Reviewed-and-tested-by: Chen Yu Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20240422082238.5784-1-xuewen.yan@unisoc.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit e224fc314c4b2f28785f30f8c85af7c566d540b3 Author: Tianchen Ding Date: Wed Mar 6 10:21:33 2024 +0800 sched/eevdf: Fix miscalculation in reweight_entity() when se is not curr reweight_eevdf() only keeps V unchanged inside itself. When se != cfs_rq->curr, it would be dequeued from rb tree first. So that V is changed and the result is wrong. Pass the original V to reweight_eevdf() to fix this issue. Fixes: eab03c23c2a1 ("sched/eevdf: Fix vruntime adjustment on reweight") Signed-off-by: Tianchen Ding [peterz: flip if() condition for clarity] Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Abel Wu Link: https://lkml.kernel.org/r/20240306022133.81008-3-dtcccc@linux.alibaba.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 6f605d6768465bec0f16cead98e1b3b08e548afe Author: Tianchen Ding Date: Wed Mar 6 10:21:32 2024 +0800 sched/eevdf: Always update V if se->on_rq when reweighting reweight_eevdf() needs the latest V to do accurate calculation for new ve and vd. So update V unconditionally when se is runnable. 
Fixes: eab03c23c2a1 ("sched/eevdf: Fix vruntime adjustment on reweight") Suggested-by: Abel Wu Signed-off-by: Tianchen Ding Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Abel Wu Tested-by: K Prateek Nayak Tested-by: Chen Yu Link: https://lore.kernel.org/r/20240306022133.81008-2-dtcccc@linux.alibaba.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 3416506799b6b2b3eab0688051082ed2fc1d1f5b Author: Wang Jinchao Date: Thu Dec 14 13:20:29 2023 +0800 sched/fair: Remove unused 'next_buddy_marked' local variable in check_preempt_wakeup_fair() This variable became unused in: 5e963f2bd465 ("sched/fair: Commit to EEVDF") Signed-off-by: Wang Jinchao Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/202312141319+0800-wangjinchao@xfusion.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 2cfaa805dc73ae4e6395a8aa69ee79b9580cc7bb Author: Yiwei Lin Date: Fri Nov 17 16:01:06 2023 +0800 sched/fair: Update min_vruntime for reweight_entity() correctly Since reweight_entity() may have chance to change the weight of cfs_rq->curr entity, we should also update_min_vruntime() if this is the case Fixes: eab03c23c2a1 ("sched/eevdf: Fix vruntime adjustment on reweight") Signed-off-by: Yiwei Lin Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Abel Wu Link: https://lore.kernel.org/r/20231117080106.12890-1-s921975628@gmail.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 13ffac20f571298c2007b90674694aebe367be89 Author: Abel Wu Date: Wed Nov 15 11:36:46 2023 +0800 sched/eevdf: O(1) fastpath for task selection Since the RB-tree is now sorted by deadline, let's first try the leftmost entity which has the earliest virtual deadline. I've done some benchmarks to see its effectiveness. 
All the benchmarks are done inside a normal cpu cgroup in a clean environment with cpu turbo disabled, on a dual-CPU Intel Xeon(R) Platinum 8260 with 2 NUMA nodes each of which has 24C/48T. hackbench: process/thread + pipe/socket + 1/2/4/8 groups netperf: TCP/UDP + STREAM/RR + 24/48/72/96/192 threads tbench: loopback 24/48/72/96/192 threads schbench: 1/2/4/8 mthreads direct: cfs_rq has only one entity parity: RUN_TO_PARITY fast: O(1) fastpath slow: heap search (%) direct parity fast slow hackbench 92.95 2.02 4.91 0.12 netperf 68.08 6.60 24.18 1.14 tbench 67.55 11.22 20.61 0.62 schbench 69.91 2.65 25.73 1.71 The above results indicate that this fastpath really makes task selection more efficient. Signed-off-by: Abel Wu Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20231115033647.80785-4-wuyun.abel@bytedance.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 010771bc9b9ab17c289743afe1241127ba5263df Author: Abel Wu Date: Wed Nov 15 11:36:45 2023 +0800 sched/eevdf: Sort the rbtree by virtual deadline Sort the task timeline by virtual deadline and keep the min_vruntime in the augmented tree, so we can avoid doubling the worst case cost and make full use of the cached leftmost node to enable O(1) fastpath picking in next patch. Signed-off-by: Abel Wu Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20231115033647.80785-3-wuyun.abel@bytedance.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 1e35ae32e182db3531da5d9055e45c7ccf826c96 Author: Abel Wu Date: Tue Nov 21 21:44:26 2023 +0200 sched/eevdf: Fix vruntime adjustment on reweight vruntime of the (on_rq && !0-lag) entity needs to be adjusted when it gets re-weighted, and the calculations can be simplified based on the fact that re-weight won't change the w-average of all the entities. Please check the proofs in comments. 
But adjusting vruntime can also cause position change in RB-tree hence require re-queue to fix up which might be costly. This might be avoided by deferring adjustment to the time the entity actually leaves tree (dequeue/pick), but that will negatively affect task selection and probably not good enough either. Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy") Signed-off-by: Abel Wu Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20231107090510.71322-2-wuyun.abel@bytedance.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 73fa83e38da16aad4009b37fe7ee7a9fb289adb9 Author: Yiwei Lin Date: Fri Oct 20 13:56:17 2023 +0800 sched/fair: Remove unused 'curr' argument from pick_next_entity() The 'curr' argument of pick_next_entity() has become unused after the EEVDF changes. [ mingo: Updated the changelog. ] Signed-off-by: Yiwei Lin Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231020055617.42064-1-s921975628@gmail.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit d2d81ece57c3c14431dc2525f551e2d2eadb2834 Author: Peter Zijlstra Date: Tue Oct 17 16:59:47 2023 +0200 sched/eevdf: Fix heap corruption more Because someone is a flaming idiot... and forgot we have current as se->on_rq but not actually in the tree itself, and walking rb_parent() on an entry not in the tree is 'funky' and KASAN complains. 
Fixes: 8dafa9d0eb1a ("sched/eevdf: Fix min_deadline heap integrity") Reported-by: 0599jiangyc@gmail.com Reported-by: Dmitry Safonov <0x7f454c46@gmail.com> Signed-off-by: Peter Zijlstra (Intel) Tested-by: Dmitry Safonov <0x7f454c46@gmail.com> Link: https://bugzilla.kernel.org/show_bug.cgi?id=218020 Link: https://lkml.kernel.org/r/CAJwJo6ZGXO07%3DQvW4fgQfbsDzQPs9xj5sAQ1zp%3DmAyPMNbHYww%40mail.gmail.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit b97630c04b5e7869bc0a5900e6e7dda8f7bcf5d3 Author: Benjamin Segall Date: Fri Sep 29 17:09:30 2023 -0700 sched/eevdf: Fix pick_eevdf() The old pick_eevdf() could fail to find the actual earliest eligible deadline when it descended to the right looking for min_deadline, but it turned out that that min_deadline wasn't actually eligible. In that case we need to go back and search through any left branches we skipped looking for the actual best _eligible_ min_deadline. This is more expensive, but still O(log n), and at worst should only involve descending two branches of the rbtree. I've run this through a userspace stress test (thank you tools/lib/rbtree.c), so hopefully this implementation doesn't miss any corner cases. 
Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy") Signed-off-by: Ben Segall Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/xm261qego72d.fsf_-_@google.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit df45b4b6a01c84ccb6ac72b0d5d20da7c90572d8 Author: Peter Zijlstra Date: Fri Oct 6 21:24:45 2023 +0200 sched/eevdf: Fix min_deadline heap integrity Marek and Biju reported instances of: "EEVDF scheduling fail, picking leftmost" which Mike correlated with cgroup scheduling and the min_deadline heap getting corrupted; some trace output confirms: > And yeah, min_deadline is hosed somehow: > > validate_cfs_rq: --- / > __print_se: ffff88845cf48080 w: 1024 ve: -58857638 lag: 870381 vd: -55861854 vmd: -66302085 E (11372/tr) > __print_se: ffff88810d165800 w: 25 ve: -80323686 lag: 22336429 vd: -41496434 vmd: -66302085 E (-1//autogroup-31) > __print_se: ffff888108379000 w: 25 ve: 0 lag: -57987257 vd: 114632828 vmd: 114632828 N (-1//autogroup-33) > validate_cfs_rq: min_deadline: -55861854 avg_vruntime: -62278313462 / 1074 = -57987256 Turns out that reweight_entity(), which tries really hard to be fast, does not do the normal dequeue+update+enqueue pattern but *does* scale the deadline. However, it then fails to propagate the updated deadline value up the heap. 
Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy") Reported-by: Marek Szyprowski Reported-by: Biju Das Reported-by: Mike Galbraith Signed-off-by: Peter Zijlstra (Intel) Tested-by: Marek Szyprowski Tested-by: Biju Das Tested-by: Mike Galbraith Link: https://lkml.kernel.org/r/20231006192445.GE743@noisy.programming.kicks-ass.net Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 3a51a2af114512733bfe3b6965a7320af592ac6e Author: Peter Zijlstra Date: Tue Sep 26 14:29:50 2023 +0200 sched/eevdf: Fix avg_vruntime() The expectation is that placing a task at avg_vruntime() makes it eligible. Turns out there is a corner case where this is not the case. Specifically, avg_vruntime() relies on the fact that integer division is a flooring function (eg. it discards the remainder). By this property the value returned is slightly left of the true average. However! when the average is a negative (relative to min_vruntime) the effect is flipped and it becomes a ceil, with the result that the returned value is just right of the average and thus not eligible. Fixes: af4cf40470c2 ("sched/fair: Add cfs_rq::avg_vruntime") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 94630ff1c5dace213fc4d886abdddff610331975 Author: Peter Zijlstra Date: Fri Sep 15 00:48:55 2023 +0200 sched/eevdf: Also update slice on placement Tasks that never consume their full slice would not update their slice value. This means that tasks that are spawned before the sysctl scaling keep their original (UP) slice length. 
Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy") Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20230915124822.847197830@noisy.programming.kicks-ass.net Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit ebdbfa17b89f097f7dbb25126033767d392ec08a Author: Sebastian Andrzej Siewior Date: Wed Sep 20 15:00:24 2023 +0200 sched/debug: Remove the /proc/sys/kernel/sched_child_runs_first sysctl The /proc/sys/kernel/sched_child_runs_first knob is no longer connected since: 5e963f2bd4654 ("sched/fair: Commit to EEVDF") Remove it. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230920130025.412071-2-bigeasy@linutronix.de Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit f3c2be38e880f64190b90cfa7a587e224036fa58 Author: Peter Zijlstra Date: Wed Aug 16 15:40:59 2023 +0200 sched/eevdf: Curb wakeup-preemption Mike and others noticed that EEVDF does like to over-schedule quite a bit -- which does hurt performance of a number of benchmarks / workloads. In particular, what seems to cause over-scheduling is that when lag is of the same order (or larger) than the request / slice then placement will not only cause the task to be placed left of current, but also with a smaller deadline than current, which causes immediate preemption. [ notably, lag bounds are relative to HZ ] Mike suggested we stick to picking 'current' for as long as it's eligible to run, giving it uninterrupted runtime until it reaches parity with the pack. Augment Mike's suggestion by only allowing it to exhaust its initial request.
One random data point: echo NO_RUN_TO_PARITY > /debug/sched/features perf stat -a -e context-switches --repeat 10 -- perf bench sched messaging -g 20 -t -l 5000 3,723,554 context-switches ( +- 0.56% ) 9.5136 +- 0.0394 seconds time elapsed ( +- 0.41% ) echo RUN_TO_PARITY > /debug/sched/features perf stat -a -e context-switches --repeat 10 -- perf bench sched messaging -g 20 -t -l 5000 2,556,535 context-switches ( +- 0.51% ) 9.2427 +- 0.0302 seconds time elapsed ( +- 0.33% ) Suggested-by: Mike Galbraith Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20230816134059.GC982867@hirez.programming.kicks-ass.net Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit bb48e3bf106a493dad3be6f2a84f507d7a5d0202 Author: Peter Zijlstra Date: Wed May 31 13:58:49 2023 +0200 sched/fair: Propagate enqueue flags into place_entity() This allows place_entity() to consider ENQUEUE_WAKEUP and ENQUEUE_MIGRATED. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230531124604.274010996@infradead.org Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 498bfeceabaf3803261f7c934e4303d6f2137ccc Author: Peter Zijlstra Date: Wed May 31 13:58:48 2023 +0200 sched/debug: Rename sysctl_sched_min_granularity to sysctl_sched_base_slice EEVDF uses this tunable as the base request/slice -- make sure the name reflects this. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230531124604.205287511@infradead.org Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit b194d2bfc5dc8070c78357d0c2b63a0cde2877d6 Author: Peter Zijlstra Date: Tue Nov 21 21:21:06 2023 +0200 sched/fair: Commit to EEVDF EEVDF is a better defined scheduling policy, as a result it has less heuristics/tunables. There is no compelling reason to keep CFS around. 
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230531124604.137187212@infradead.org [@Helium-Studio: Also remove sysctl entries that were dropped by this commit] Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> # Conflicts: # kernel/sched/fair.c # kernel/sysctl.c commit 29c0c9b70590c86dbb3504704b4043ffe793d48f Author: Peter Zijlstra Date: Wed May 31 13:58:46 2023 +0200 sched/smp: Use lag to simplify cross-runqueue placement Using lag is both more correct and simpler when moving between runqueues. Notably, min_vruntime() was invented as a cheap approximation of avg_vruntime() for this very purpose (SMP migration). Since we now have the real thing; use it. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230531124604.068911180@infradead.org Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 241fa4e083655b4f67cae56fa716ca6234506b30 Author: Chengming Zhou Date: Thu Aug 18 20:48:01 2022 +0800 sched/fair: Combine detach into dequeue when migrating task When we are migrating task out of the CPU, we can combine detach and propagation into dequeue_entity() to save the detach_entity_cfs_rq() in migrate_task_rq_fair(). This optimization is like combining DO_ATTACH in the enqueue_entity() when migrating task to the CPU. So we don't have to traverse the CFS tree extra time to do the detach_entity_cfs_rq() -> propagate_entity_cfs_rq(), which wouldn't be called anymore with this patch's change.
detach_task() deactivate_task() dequeue_task_fair() for_each_sched_entity(se) dequeue_entity() update_load_avg() /* (1) */ detach_entity_load_avg() set_task_cpu() migrate_task_rq_fair() detach_entity_cfs_rq() /* (2) */ update_load_avg(); detach_entity_load_avg(); propagate_entity_cfs_rq(); for_each_sched_entity() update_load_avg() This patch saves the detach_entity_cfs_rq() called in (2) by doing the detach_entity_load_avg() for a CPU migrating task inside (1) (the task being the first se in the loop) Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20220818124805.601-6-zhouchengming@bytedance.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit d563959f4820312d3d1b462c78ca62a8a5fe3157 Author: Peter Zijlstra Date: Wed May 31 13:58:45 2023 +0200 sched/fair: Commit to lag based placement Removes the FAIR_SLEEPERS code in favour of the new LAG based placement. Specifically, the whole FAIR_SLEEPER thing was a very crude approximation to make up for the lack of lag based placement, specifically the 'service owed' part. This is important for things like 'starve' and 'hackbench'. One side effect of FAIR_SLEEPER is that it caused 'small' unfairness, specifically, by always ignoring up-to 'thresh' sleeptime it would have a 50%/50% time distribution for a 50% sleeper vs a 100% runner, while strictly speaking this should (of course) result in a 33%/67% split (as CFS will also do if the sleep period exceeds 'thresh').
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230531124604.000198861@infradead.org Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit ac875e302647f7266ce5678d02f06732e4512f2c Author: Peter Zijlstra Date: Wed May 31 13:58:44 2023 +0200 sched/fair: Implement an EEVDF-like scheduling policy Where CFS is currently a WFQ based scheduler with only a single knob, the weight. The addition of a second, latency oriented parameter, makes something like WF2Q or EEVDF based a much better fit. Specifically, EEVDF does EDF like scheduling in the left half of the tree -- those entities that are owed service. Except because this is a virtual time scheduler, the deadlines are in virtual time as well, which is what allows over-subscription. EEVDF has two parameters: - weight, or time-slope: which is mapped to nice just as before - request size, or slice length: which is used to compute the virtual deadline as: vd_i = ve_i + r_i/w_i Basically, by setting a smaller slice, the deadline will be earlier and the task will be more eligible and ran earlier. Tick driven preemption is driven by request/slice completion; while wakeup preemption is driven by the deadline. Because the tree is now effectively an interval tree, and the selection is no longer 'leftmost', over-scheduling is less of a problem. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230531124603.931005524@infradead.org Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit fa237d82950d832b045ca398b0bdd7088422cb37 Author: Peter Zijlstra Date: Wed May 31 13:58:42 2023 +0200 sched/fair: Add lag based placement With the introduction of avg_vruntime, it is possible to approximate lag (the entire purpose of introducing it in fact). Use this to do lag based placement over sleep+wake. 
Specifically, the FAIR_SLEEPERS thing places things too far to the left and messes up the deadline aspect of EEVDF. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230531124603.794929315@infradead.org Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 6063bb0c8a1acd02f35d9de3a2f73c17fd99c3f2 Author: Peter Zijlstra Date: Wed May 31 13:58:41 2023 +0200 sched/fair: Remove sched_feat(START_DEBIT) With the introduction of avg_vruntime() there is no need to use worse approximations. Take the 0-lag point as starting point for inserting new tasks. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230531124603.722361178@infradead.org Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 45d55875264f7f7756c814df55a0c505de1f13c0 Author: Peter Zijlstra Date: Wed May 31 13:58:40 2023 +0200 sched/fair: Add cfs_rq::avg_vruntime In order to move to an eligibility based scheduling policy, we need to have a better approximation of the ideal scheduler. Specifically, for a virtual time weighted fair queueing based scheduler the ideal scheduler will be the weighted average of the individual virtual runtimes (math in the comment). As such, compute the weighted average to approximate the ideal scheduler -- note that the approximation is in the individual task behaviour, which isn't strictly conformant. Specifically consider adding a task with a vruntime left of center, in this case the average will move backwards in time -- something the ideal scheduler would of course never do. 
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230531124603.654144274@infradead.org Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 482a42257efb9da9db63f03a8af5d5f07d3518f6 Author: Jiang Biao Date: Tue Aug 11 19:32:09 2020 +0800 sched/fair: Simplify the work when reweighting entity The code in reweight_entity() can be simplified. For a sched entity on the rq, the entity accounting can be replaced by cfs_rq instantaneous load updates currently called from within the entity accounting. Even though an entity on the rq can't represent a task in reweight_entity() (a task is always dequeued before calling this function) and so the numa task accounting and the rq->cfs_tasks list management of the entity accounting are never called, the redundant cfs_rq->nr_running decrement/increment will be avoided. Signed-off-by: Jiang Biao Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20200811113209.34057-1-benbjiang@tencent.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit d21036a9134537a7e56654ea6191797905c78148 Author: Vincent Donnefort Date: Tue Jun 21 10:04:08 2022 +0100 sched/fair: Provide u64 read for 32-bits arch helper Introducing macro helpers u64_u32_{store,load}() to factorize lockless accesses to u64 variables for 32-bits architectures. Users are for now cfs_rq.min_vruntime and sched_avg.last_update_time. To accommodate the latter where the copy lies outside of the structure (cfs_rq.last_update_time_copy instead of sched_avg.last_update_time_copy), use the _copy() version of those helpers. Those new helpers encapsulate smp_rmb() and smp_wmb() synchronization and therefore, have a small penalty for 32-bits machines in set_task_rq_fair() and init_cfs_rq().
Signed-off-by: Vincent Donnefort Signed-off-by: Vincent Donnefort Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Tested-by: Lukasz Luba Link: https://lkml.kernel.org/r/20220621090414.433602-2-vdonnefort@google.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit fded4adb0d75b4c4c5400278f277a5ff8a782020 Author: Peter Zijlstra Date: Wed Apr 29 17:04:12 2020 +0200 rbtree, sched/fair: Use rb_add_cached() Reduce rbtree boiler plate by using the new helper function. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Davidlohr Bueso Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 759f0c10c642f4917e94b9ad3a29a68cfe02f1a5 Author: Michel Lespinasse Date: Wed Sep 25 16:46:10 2019 -0700 augmented rbtree: rework the RB_DECLARE_CALLBACKS macro definition Change the definition of the RBCOMPUTE function. The propagate callback repeatedly calls RBCOMPUTE as it moves from leaf to root. it wants to stop recomputing once the augmented subtree information doesn't change. This was previously checked using the == operator, but that only works when the augmented subtree information is a scalar field. This commit modifies the RBCOMPUTE function so that it now sets the augmented subtree information instead of returning it, and returns a boolean value indicating if the propagate callback should stop. The motivation for this change is that I want to introduce augmented rbtree uses where the augmented data for the subtree is a struct instead of a scalar. 
Link: http://lkml.kernel.org/r/20190703040156.56953-4-walken@google.com Signed-off-by: Michel Lespinasse Acked-by: Peter Zijlstra (Intel) Cc: David Howells Cc: Davidlohr Bueso Cc: Uladzislau Rezki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 9372e034d9366c3cad0772ee1b310652d90bcb70 Author: Michel Lespinasse Date: Wed Sep 25 16:46:07 2019 -0700 augmented rbtree: add new RB_DECLARE_CALLBACKS_MAX macro Add RB_DECLARE_CALLBACKS_MAX, which generates augmented rbtree callbacks for the case where the augmented value is a scalar whose definition follows a max(f(node)) pattern. This actually covers all present uses of RB_DECLARE_CALLBACKS, and saves some (source) code duplication in the various RBCOMPUTE function definitions. [walken@google.com: fix mm/vmalloc.c] Link: http://lkml.kernel.org/r/CANN689FXgK13wDYNh1zKxdipeTuALG4eKvKpsdZqKFJ-rvtGiQ@mail.gmail.com [walken@google.com: re-add check to check_augmented()] Link: http://lkml.kernel.org/r/20190727022027.GA86863@google.com Link: http://lkml.kernel.org/r/20190703040156.56953-3-walken@google.com Signed-off-by: Michel Lespinasse Acked-by: Peter Zijlstra (Intel) Cc: David Howells Cc: Davidlohr Bueso Cc: Uladzislau Rezki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit a3175d39d501e3512d4df600939201cbda8e9b33 Author: Michel Lespinasse Date: Wed Sep 25 16:46:04 2019 -0700 augmented rbtree: add comments for RB_DECLARE_CALLBACKS macro Patch series "make RB_DECLARE_CALLBACKS more generic", v3. These changes are intended to make the RB_DECLARE_CALLBACKS macro more generic (allowing the augmented subtree information to be a struct instead of a scalar). I have verified the compiled lib/interval_tree.o and mm/mmap.o files to check that they didn't change.
This held as expected for interval_tree.o; mmap.o did have some changes which could be reverted by marking __vma_link_rb as noinline. I did not add such a change to the patchset; I felt it was reasonable enough to leave the inlining decision up to the compiler. This patch (of 3): Add a short comment summarizing the arguments to RB_DECLARE_CALLBACKS. The arguments are also now capitalized. This copies the style of the INTERVAL_TREE_DEFINE macro. No functional changes in this commit, only comments and capitalization. Link: http://lkml.kernel.org/r/20190703040156.56953-2-walken@google.com Signed-off-by: Michel Lespinasse Acked-by: Davidlohr Bueso Acked-by: Peter Zijlstra (Intel) Cc: David Howells Cc: Uladzislau Rezki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 9600cbd0d286c6cf86b68e88ea0600a219815d37 Author: Peter Zijlstra Date: Wed May 31 13:58:43 2023 +0200 rbtree: Add rb_add_augmented_cached() helper While slightly sub-optimal, updating the augmented data while going down the tree during lookup would be faster -- alas the augment interface does not currently allow for that, provide a generic helper to add a node to an augmented cached tree. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230531124603.862983648@infradead.org Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 3773003b378a5a3e5e44d0083d05c7437fd8a4b5 Author: Peter Zijlstra Date: Mon Oct 9 10:36:53 2017 +0200 sched/core: Ensure load_balance() respects the active_mask While load_balance() masks the source CPUs against active_mask, it had a hole against the destination CPU. Ensure the destination CPU is also part of the 'domain-mask & active-mask' set. 
Reported-by: Levin, Alexander (Sasha Levin) Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 77d1dfda0e79 ("sched/topology, cpuset: Avoid spurious/wrong domain rebuilds") Signed-off-by: Ingo Molnar commit 1a4a49b4cd14bdcf91e3c9de97aa98c912e11c83 Author: Wanpeng Li Date: Mon Jan 13 08:50:27 2020 +0800 [BACKPORT]sched/nohz: Optimize get_nohz_timer_target() On a machine, CPU 0 is used for housekeeping, the other 39 CPUs in the same socket are in nohz_full mode. We can observe huge time burn in the loop for searching nearest busy housekeeper cpu by ftrace. 2) | get_nohz_timer_target() { 2) 0.240 us | housekeeping_test_cpu(); 2) 0.458 us | housekeeping_test_cpu(); ... 2) 0.292 us | housekeeping_test_cpu(); 2) 0.240 us | housekeeping_test_cpu(); 2) 0.227 us | housekeeping_any_cpu(); 2) + 43.460 us | } This patch optimizes the searching logic by finding a nearest housekeeper CPU in the housekeeping cpumask, it can minimize the worst searching time from ~44us to < 10us in my testing. In addition, the last iterated busy housekeeper can become a random candidate while current CPU is a better fallback if it is a housekeeper. Signed-off-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/1578876627-11938-1-git-send-email-wanpengli@tencent.com Signed-off-by: DennySPB commit f773f3b275ad618158280c9793a1d27639f298f7 Author: Vincent Guittot Date: Fri Dec 14 17:01:56 2018 +0100 sched/fair: Trigger asym_packing during idle load balance Newly idle load balancing is not always triggered when a CPU becomes idle. This prevents the scheduler from getting a chance to migrate the task for asym packing. Enable active migration during idle load balance too. 
Signed-off-by: Vincent Guittot Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: valentin.schneider@arm.com Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Jesse Chan Signed-off-by: billaids commit 75352e3fb31924ec3af106be73e210d2258fefe8 Author: Peter Zijlstra Date: Mon Oct 9 10:36:53 2017 +0200 sched/core: Ensure load_balance() respects the active_mask While load_balance() masks the source CPUs against active_mask, it had a hole against the destination CPU. Ensure the destination CPU is also part of the 'domain-mask & active-mask' set. Reported-by: Levin, Alexander (Sasha Levin) Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 77d1dfda0e79 ("sched/topology, cpuset: Avoid spurious/wrong domain rebuilds") Signed-off-by: Ingo Molnar commit 74b6dcb03b6b9fa56fbcdf2ffb33e50fa97db6dc Author: Frederic Weisbecker Date: Tue Dec 3 17:01:06 2019 +0100 sched: Use fair:prio_changed() instead of ad-hoc implementation set_user_nice() implements its own version of fair::prio_changed() and therefore misses a specific optimization towards nohz_full CPUs that avoid sending an resched IPI to a reniced task running alone. Use the proper callback instead. Change-Id: I51ba67826dfcec0aa423758281943c01ba267c91 Reported-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Ingo Molnar Link: https://lkml.kernel.org/r/20191203160106.18806-3-frederic@kernel.org Signed-off-by: mydongistiny Signed-off-by: DennySPB commit 1994e58a6bf1726d32a1c58af16b2be6fdfafd80 Author: Vincent Guittot Date: Thu Apr 26 12:19:32 2018 +0200 sched/fair: Fix the update of blocked load when newly idle With commit: 31e77c93e432 ("sched/fair: Update blocked load when newly idle") ... we release the rq->lock when updating blocked load of idle CPUs. This opens a time window during which another CPU can add a task to this CPU's cfs_rq. 
The check for newly added task of idle_balance() is not in the common path. Move the out label to include this check. Reported-by: Heiner Kallweit Tested-by: Geert Uytterhoeven Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 31e77c93e432 ("sched/fair: Update blocked load when newly idle") Link: http://lkml.kernel.org/r/20180426103133.GA6953@linaro.org Signed-off-by: Ingo Molnar commit ddd49e696ad91a561a8c8bf12ef61bc0e113ac41 Author: Josh Don Date: Tue Aug 4 12:34:13 2020 -0700 sched/fair: Ignore cache hotness for SMT migration SMT siblings share caches, so cache hotness should be irrelevant for cross-sibling migration. Signed-off-by: Josh Don Proposed-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200804193413.510651-1-joshdon@google.com commit de8674e8809c07543874ec4a4c91631c481caf4f Author: Peter Oskolkov Date: Wed Sep 30 10:35:32 2020 -0700 sched/fair: Tweak pick_next_entity() Currently, pick_next_entity(...) has the following structure (simplified): [...] if (last_buddy_ok()) result = last_buddy; if (next_buddy_ok()) result = next_buddy; [...] The intended behavior is to prefer next buddy over last buddy; the current code somewhat obfuscates this, and also wastes cycles checking the last buddy when eventually the next buddy is picked up. So this patch refactors two 'ifs' above into [...] if (next_buddy_ok()) result = next_buddy; else if (last_buddy_ok()) result = last_buddy; [...] Signed-off-by: Peter Oskolkov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20200930173532.1069092-1-posk@google.com commit 6a20c355412765268bfc7635b626899873f73337 Author: Clement Courbet Date: Wed Mar 3 14:46:53 2021 -0800 sched: Optimize __calc_delta() A significant portion of __calc_delta() time is spent in the loop shifting a u64 by 32 bits. Use `fls` instead of iterating. 
This is ~7x faster on benchmarks. The generic `fls` implementation (`generic_fls`) is still ~4x faster than the loop. Architectures that have a better implementation will make use of it. For example, on x86 we get an additional factor 2 in speed without dedicated implementation. On GCC, the asm versions of `fls` are about the same speed as the builtin. On Clang, the versions that use fls are more than twice as slow as the builtin. This is because the way the `fls` function is written, clang puts the value in memory: https://godbolt.org/z/EfMbYe. This bug is filed at https://bugs.llvm.org/show_bug.cgi?id=49406. ``` name cpu/op BM_Calc<__calc_delta_loop> 9.57ms ±12% BM_Calc<__calc_delta_generic_fls> 2.36ms ±13% BM_Calc<__calc_delta_asm_fls> 2.45ms ±13% BM_Calc<__calc_delta_asm_fls_nomem> 1.66ms ±12% BM_Calc<__calc_delta_asm_fls64> 2.46ms ±13% BM_Calc<__calc_delta_asm_fls64_nomem> 1.34ms ±15% BM_Calc<__calc_delta_builtin> 1.32ms ±11% ``` Signed-off-by: Clement Courbet Signed-off-by: Josh Don Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210303224653.2579656-1-joshdon@google.com commit e4793308d8214d546ae2eead6f9fac0426ada0f9 Author: Amit Kucheria Date: Mon Oct 21 17:45:12 2019 +0530 cpufreq: Initialize the governors in core_initcall Initialize the cpufreq governors earlier to allow for earlier performance control during the boot process. Signed-off-by: Amit Kucheria Acked-by: Viresh Kumar Reviewed-by: Rafael J. Wysocki Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/b98eae9b44eb2f034d7f5d12a161f5f831be1eb7.1571656015.git.amit.kucheria@linaro.org # Conflicts: # drivers/cpufreq/cpufreq_performance.c commit 7f1bc529bc787dcbedfd2899cb9c99326dd57215 Author: Mathieu Poirier Date: Fri Jul 19 15:59:53 2019 +0200 sched/topology: Add partition_sched_domains_locked() Introduce the partition_sched_domains_locked() function by taking the mutex locking code out of the original function. 
That way the work done by partition_sched_domains_locked() can be reused without dropping the mutex lock. No change of functionality is introduced by this patch. Tested-by: Dietmar Eggemann Signed-off-by: Mathieu Poirier Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bristot@redhat.com Cc: claudio@evidence.eu.com Cc: lizefan@huawei.com Cc: longman@redhat.com Cc: luca.abeni@santannapisa.it Cc: rostedt@goodmis.org Cc: tommaso.cucinotta@santannapisa.it Link: https://lkml.kernel.org/r/20190719140000.31694-2-juri.lelli@redhat.com Signed-off-by: Ingo Molnar commit e17f27e0d39d77e4b1011278796a30e8d7b05d6f Author: Valentin Schneider Date: Tue Apr 9 18:35:45 2019 +0100 sched/topology: Skip duplicate group rewrites in build_sched_groups() While staring at build_sched_domains(), I realized that get_group() does several duplicate (thus useless) writes. If you take the Arm Juno r0 (LITTLEs = [0, 3, 4, 5], bigs = [1, 2]), the sched_group build flow would look like this: ('MC[cpu]->sg' means 'per_cpu_ptr(&tl->data->sg, cpu)' with 'tl == MC') build_sched_groups(MC[CPU0]->sd, CPU0) get_group(0) -> MC[CPU0]->sg get_group(3) -> MC[CPU3]->sg get_group(4) -> MC[CPU4]->sg get_group(5) -> MC[CPU5]->sg build_sched_groups(DIE[CPU0]->sd, CPU0) get_group(0) -> DIE[CPU0]->sg get_group(1) -> DIE[CPU1]->sg <=================+ | build_sched_groups(MC[CPU1]->sd, CPU1) | get_group(1) -> MC[CPU1]->sg | get_group(2) -> MC[CPU2]->sg | | build_sched_groups(DIE[CPU1]->sd, CPU1) ^ get_group(1) -> DIE[CPU1]->sg } We've set up these two up here! get_group(3) -> DIE[CPU0]->sg } From this point on, we will only use sched_groups that have been previously visited & initialized. The only new operation will be which group pointer we affect to sd->groups. On the Juno r0 we get 32 get_group() calls, every single one of them writing to a sched_group->cpumask. However, all of the data structures we need are set up after 8 visits (see above). 
Return early from get_group() if we've already visited (and thus initialized) the sched_group we're looking at. Overlapping domains are not affected as they do not use build_sched_groups(). Tested on a Juno and a 2 * (Xeon E5-2690) system. ( FWIW I initially checked the refs for both sg && sg->sgc, but figured if they weren't both 0 or > 1 then something must have gone wrong, so I threw in a WARN_ON(). ) No change in functionality intended. Signed-off-by: Valentin Schneider Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar commit c62d6453c251607e2fe6f6e204a0711b7edc03df Author: Vincent Guittot Date: Mon Jun 17 17:00:17 2019 +0200 sched/topology: Remove unused 'sd' parameter from arch_scale_cpu_capacity() The 'struct sched_domain *sd' parameter to arch_scale_cpu_capacity() is unused since commit: 765d0af19f5f ("sched/topology: Remove the ::smt_gain field from 'struct sched_domain'") Remove it. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Viresh Kumar Reviewed-by: Valentin Schneider Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: gregkh@linuxfoundation.org Cc: linux@armlinux.org.uk Cc: quentin.perret@arm.com Cc: rafael@kernel.org Link: https://lkml.kernel.org/r/1560783617-5827-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar commit 360635bafb36f39478a5c32b43e3cb64b1df4dad Author: John Galt Date: Mon Apr 22 16:01:47 2024 -0400 sched-pelt: make half life nonconfigurable 16ms only commit baa288445831e3cef3b7f6cd20bc0c52f331acc2 Author: John Galt Date: Mon Apr 22 15:32:50 2024 -0400 sched/fair: decrease migration cost commit 7e1c7b478d201f3909d4c8f1f9f121064457f4e9 Author: Sebastian Andrzej Siewior Date: Tue Aug 1 17:26:48 2023 +0200 sched/rt: Don't try push tasks if there are none. I have a RT task X at a high priority and cyclictest on each CPU with lower priority than X's. 
If X is active and each CPU wakes their own cyclictest thread then it ends in a longer rto_push storm. A random CPU determines via balance_rt() that the CPU on which X is running needs to push tasks. X has the highest priority, cyclictest is next in line so there is nothing that can be done since the task with the higher priority is not touched. tell_cpu_to_push() increments rto_loop_next and schedules rto_push_irq_work_func() on X's CPU. The other CPUs also increment the loop counter and do the same. Once rto_push_irq_work_func() is active it does nothing because it has _no_ pushable tasks on its runqueue. Then checks rto_next_cpu() and decides to queue irq_work on the local CPU because another CPU requested a push by incrementing the counter. I have traces where ~30 CPUs request this ~3 times each before it finally ends. This greatly increases X's runtime while X isn't making much progress. Teach rto_next_cpu() to only return CPUs which also have tasks on their runqueue which can be pushed away. This does not reduce the tell_cpu_to_push() invocations (rto_loop_next counter increments) but reduces the amount of issued rto_push_irq_work_func() if nothing can be done. As the result the overloaded CPU is blocked less often. There are still cases where the "same job" is repeated several times (for instance the current CPU needs to resched but didn't yet because the irq-work is repeated a few times and so the old task remains on the CPU) but the majority of requests end in tell_cpu_to_push() before an IPI is issued. 
Reviewed-by: "Steven Rostedt (Google)" Link: https://lore.kernel.org/r/20230801152648._y603AS_@linutronix.de Signed-off-by: Sebastian Andrzej Siewior commit 93662a6fbca530b6480fc92e9d119c5ccf09da7f Author: Vincent Guittot Date: Wed May 13 15:55:28 2020 +0200 sched/fair: Fix unthrottle_cfs_rq() for leaf_cfs_rq list Although not exactly identical, unthrottle_cfs_rq() and enqueue_task_fair() are quite close and follow the same sequence for enqueuing an entity in the cfs hierarchy. Modify unthrottle_cfs_rq() to use the same pattern as enqueue_task_fair(). This fixes a problem already faced with the latter and adds an optimization in the last for_each_sched_entity loop. Fixes: fe61468b2cb (sched/fair: Fix enqueue_task_fair warning) Reported-by: Tao Zhou Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Reviewed-by: Ben Segall Link: https://lkml.kernel.org/r/20200513135528.4742-1-vincent.guittot@linaro.org commit e8164ed37c41537ee1db3584095cd533705d5ad5 Author: Josh Don Date: Fri Apr 10 15:52:08 2020 -0700 sched/fair: Remove distribute_running from CFS bandwidth This is mostly a revert of commit: baa9be4ffb55 ("sched/fair: Fix throttle_list starvation with low CFS quota") The primary use of distribute_running was to determine whether to add throttled entities to the head or the tail of the throttled list. Now that we always add to the tail, we can remove this field. The other use of distribute_running is in the slack_timer, so that we don't start a distribution while one is already running. However, even in the event that this race occurs, it is fine to have two distributions running (especially now that distribute grabs the cfs_b->lock to determine remaining quota before assigning). 
Signed-off-by: Josh Don Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Tested-by: Phil Auld Link: https://lkml.kernel.org/r/20200410225208.109717-3-joshdon@google.com commit 1eb50996e28558aabcd74d3afb37e044fedac26d Author: bsegall@google.com Date: Thu Jun 6 10:21:01 2019 -0700 sched/fair: Don't push cfs_bandwith slack timers forward When a cfs_rq sleeps and returns its quota, we delay for 5ms before waking any throttled cfs_rqs to coalesce with other cfs_rqs going to sleep, as this has to be done outside of the rq lock we hold. The current code waits for 5ms without any sleeps, instead of waiting for 5ms from the first sleep, which can delay the unthrottle more than we want. Switch this around so that we can't push this forward forever. This requires an extra flag rather than using hrtimer_active, since we need to start a new timer if the current one is in the process of finishing. Signed-off-by: Ben Segall Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Xunlei Pang Acked-by: Phil Auld Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/xm26a7euy6iq.fsf_-_@bsegall-linux.svl.corp.google.com Signed-off-by: Ingo Molnar Signed-off-by: DennySPB commit 2d4490a11d8952e8016647a5bbfa98f6df781dde Author: Paul Turner Date: Fri Apr 10 15:52:07 2020 -0700 sched/fair: Eliminate bandwidth race between throttling and distribution There is a race window in which an entity begins throttling before quota is added to the pool, but does not finish throttling until after we have finished with distribute_cfs_runtime(). This entity is not observed by distribute_cfs_runtime() because it was not on the throttled list at the time that distribution was running. This race manifests as rare period-length stalls for such entities. Rather than heavy-weight the synchronization with the progress of distribution, we can fix this by aborting throttling if bandwidth has become available. 
Otherwise, we immediately add the entity to the throttled list so that it can be observed by a subsequent distribution. Additionally, we can remove the case of adding the throttled entity to the head of the throttled list, and simply always add to the tail. Thanks to 26a8b12747c97, distribute_cfs_runtime() no longer holds onto its own pool of runtime. This means that if we do hit the !assign and distribute_running case, we know that distribution is about to end. Signed-off-by: Paul Turner Signed-off-by: Ben Segall Signed-off-by: Josh Don Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Link: https://lkml.kernel.org/r/20200410225208.109717-2-joshdon@google.com commit 4f08342e65cedd1082630c987d9a7bafa7703c70 Author: Huaixin Chang Date: Fri Mar 27 11:26:25 2020 +0800 sched/fair: Fix race between runtime distribution and assignment Currently, there is a potential race between distribute_cfs_runtime() and assign_cfs_rq_runtime(). Race happens when cfs_b->runtime is read, distributes without holding lock and finds out there is not enough runtime to charge against after distribution. Because assign_cfs_rq_runtime() might be called during distribution, and use cfs_b->runtime at the same time. Fibtest is the tool to test this race. Assume all gcfs_rq is throttled and cfs period timer runs, slow threads might run and sleep, returning unused cfs_rq runtime and keeping min_cfs_rq_runtime in their local pool. If all this happens sufficiently quickly, cfs_b->runtime will drop a lot. If runtime distributed is large too, over-use of runtime happens. A runtime over-using by about 70 percent of quota is seen when we test fibtest on a 96-core machine. We run fibtest with 1 fast thread and 95 slow threads in test group, configure 10ms quota for this group and see the CPU usage of fibtest is 17.0%, which is far more than the expected 10%. On a smaller machine with 32 cores, we also run fibtest with 96 threads. 
CPU usage is more than 12%, which is also more than expected 10%. This shows that on similar workloads, this race do affect CPU bandwidth control. Solve this by holding lock inside distribute_cfs_runtime(). Fixes: c06f04c70489 ("sched: Fix potential near-infinite distribute_cfs_runtime() loop") Reviewed-by: Ben Segall Signed-off-by: Huaixin Chang Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/lkml/20200325092602.22471-1-changhuaixin@linux.alibaba.com/ commit 455896c7c0c647b57e0b742d00b3b65aa8f02e87 Author: kondors1995 Date: Thu Apr 4 21:48:12 2024 +0300 sched/cass:fixup commit 4b88ec1c7957d055e3a23067908fa0ec47cbd2c5 Author: Sultan Alsawaf Date: Wed Mar 13 21:25:29 2024 -0700 sched/cass: Eliminate redundant calls to smp_processor_id() Calling smp_processor_id() can be expensive depending on how an arch implements it, so avoid calling it more than necessary. Use the raw variant too since this code is always guaranteed to run with preemption disabled. Signed-off-by: Sultan Alsawaf commit f7d512d05d97dea633689bf689fd8a63960248ca Author: Sultan Alsawaf Date: Tue Mar 12 16:55:56 2024 -0700 sched/cass: Only treat sync waker CPU as idle if there's one task running For synchronized wakes, the waker's CPU should only be treated as idle if there aren't any other running tasks on that CPU. This is because, for synchronized wakes, it is assumed that the waker will immediately go to sleep after waking the wakee; therefore, if there aren't any other tasks running on the waker's CPU, it'll go idle and should be treated as such to improve task placement. This optimization only applies when there aren't any other tasks running on the waker's CPU, however. Fix it by ensuring that there's only the waker running on its CPU. 
Signed-off-by: Sultan Alsawaf commit 3264ea2961b6a7a2e5589b199c031aa17361401a Author: Sultan Alsawaf Date: Mon Feb 19 13:13:02 2024 -0800 sched/cass: Fix suboptimal task placement when uclamp is used Uclamp is designed to specify a process' CPU performance requirement scaled as a CPU capacity value. It simply denotes the process' requirement for the CPU's raw performance and thus P-state. CASS currently treats uclamp as a CPU load value however, producing wildly suboptimal CPU placement decisions for tasks which use uclamp. This hurts performance and, even worse, massively hurts energy efficiency, with CASS sometimes yielding power consumption that is a few times higher than EAS. Since uclamp inherently throws a wrench into CASS's goal of keeping relative P-states as low as possible across all CPUs, making it cooperate with CASS requires a multipronged approach. Make the following three changes to fix the uclamp task placement issue: 1. Treat uclamp as a CPU performance value rather than a CPU load value. 2. Clamp a CPU's utilization to the task's uclamp floor in order to keep relative P-states as low as possible across all CPUs. 3. Consider preferring a non-idle CPU for uclamped tasks to avoid pushing up the P-state of more than one CPU when there are multiple concurrent uclamped tasks. This fixes CASS's massive energy efficiency and performance issues when uclamp is used. Signed-off-by: Sultan Alsawaf commit dc098ba6aab229c19099642f561d33562f1ff755 Author: Sultan Alsawaf Date: Sat Jan 6 00:34:48 2024 -0800 sched/cass: Perform runqueue selection for RT tasks too RT tasks aren't placed on CPUs in a load-balanced manner, much less an energy efficient one. On systems which contain many RT tasks and/or IRQ threads, energy efficiency and throughput are diminished significantly by the default RT runqueue selection scheme which targets minimal latency. In practice, performance is actually improved by spreading RT tasks fairly, despite the small latency impact. 
Additionally, energy efficiency is significantly improved since the placement of all tasks benefits from energy-efficient runqueue selection, rather than just CFS tasks. Perform runqueue selection for RT tasks in CASS to significantly improve energy efficiency and overall performance. Signed-off-by: Sultan Alsawaf commit c8bbbe7148731e0024d0681f6cc8738cbe1cee06 Author: kondors1995 Date: Thu Apr 4 12:48:06 2024 +0300 sched/cass:checkout to kerneltoast/android_kernel_google_zuma@63f0b82d3 commit 1d4260eba475bad8a2aba7a974e854baefeb1040 Author: Sultan Alsawaf Date: Wed Feb 28 21:07:44 2024 -0800 sched/cass: Clean up local variable scope in cass_best_cpu() Move `curr` and `idle_state` to within the loop's scope for better readability. Also, leave a comment about `curr->cpu` to make it clear that `curr->cpu` must be initialized within the loop in order for `best->cpu` to be valid. Signed-off-by: Sultan Alsawaf commit cc47ef1ef764b7d261402bfa04a42dc4a418ab0b Author: Sultan Alsawaf Date: Sun Dec 17 19:04:28 2023 -0800 sched/cass: Fix CPU selection when no candidate CPUs are idle When no candidate CPUs are idle, CASS would keep `cidx` unchanged, and thus `best == curr` would always be true. As a result, since the empty candidate slot never changes, the current candidate `curr` always overwrites the best candidate `best`. This causes the last valid CPU to always be selected by CASS when no CPUs are idle (i.e., under heavy load). Fix it by ensuring that the CPU loop in cass_best_cpu() flips the free candidate index after the first candidate CPU is evaluated. Signed-off-by: Sultan Alsawaf commit 2dec239da28c7e1d19ead79686f5114e417e6d65 Author: Patrick Bellasi Date: Mon Nov 5 14:54:00 2018 +0000 UPSTREAM: sched/fair: Add lsub_positive() and use it consistently The following pattern: var -= min_t(typeof(var), var, val); is used multiple times in fair.c. 
The existing sub_positive() already captures that pattern, but it also adds an explicit load-store to properly support lockless observations. In other cases the pattern above is used to update local, and/or not concurrently accessed, variables. Let's add a simpler version of sub_positive(), targeted at local variables updates, which gives the same readability benefits at calling sites, without enforcing {READ,WRITE}_ONCE() barriers. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Peter Zijlstra Cc: Quentin Perret Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Link: https://lore.kernel.org/lkml/20181031184527.GA3178@hirez.programming.kicks-ass.net Change-Id: I6a6a3b2ae9e4baa4ab6e906bf2aaed7306303025 Signed-off-by: Jason Edson Signed-off-by: DennySPb Signed-off-by: Tashfin Shakeer Rhythm Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit fc9b2f93da21cddfc7d76fa941eb496935e48db4 Author: Rohit Jain Date: Wed May 2 13:52:10 2018 -0700 sched/core: Don't schedule threads on pre-empted vCPUs In paravirt configurations today, spinlocks figure out whether a vCPU is running to determine whether or not spinlock should bother spinning. We can use the same logic to prioritize CPUs when scheduling threads. If a vCPU has been pre-empted, it will incur the extra cost of VMENTER and the time it actually spends to be running on the host CPU. If we had other vCPUs which were actually running on the host CPU and idle we should schedule threads there. Performance numbers: Note: With patch is referred to as Paravirt in the following and without patch is referred to as Base. 
1) When only 1 VM is running: a) Hackbench test on KVM 8 vCPUs, 10,000 loops (lower is better): +-------+-----------------+----------------+ |Number |Paravirt |Base | |of +---------+-------+-------+--------+ |Threads|Average |Std Dev|Average| Std Dev| +-------+---------+-------+-------+--------+ |1 |1.817 |0.076 |1.721 | 0.067 | |2 |3.467 |0.120 |3.468 | 0.074 | |4 |6.266 |0.035 |6.314 | 0.068 | |8 |11.437 |0.105 |11.418 | 0.132 | |16 |21.862 |0.167 |22.161 | 0.129 | |25 |33.341 |0.326 |33.692 | 0.147 | +-------+---------+-------+-------+--------+ 2) When two VMs are running with same CPU affinities: a) tbench test on VM 8 cpus Base: VM1: Throughput 220.59 MB/sec 1 clients 1 procs max_latency=12.872 ms Throughput 448.716 MB/sec 2 clients 2 procs max_latency=7.555 ms Throughput 861.009 MB/sec 4 clients 4 procs max_latency=49.501 ms Throughput 1261.81 MB/sec 7 clients 7 procs max_latency=76.990 ms VM2: Throughput 219.937 MB/sec 1 clients 1 procs max_latency=12.517 ms Throughput 470.99 MB/sec 2 clients 2 procs max_latency=12.419 ms Throughput 841.299 MB/sec 4 clients 4 procs max_latency=37.043 ms Throughput 1240.78 MB/sec 7 clients 7 procs max_latency=77.489 ms Paravirt: VM1: Throughput 222.572 MB/sec 1 clients 1 procs max_latency=7.057 ms Throughput 485.993 MB/sec 2 clients 2 procs max_latency=26.049 ms Throughput 947.095 MB/sec 4 clients 4 procs max_latency=45.338 ms Throughput 1364.26 MB/sec 7 clients 7 procs max_latency=145.124 ms VM2: Throughput 224.128 MB/sec 1 clients 1 procs max_latency=4.564 ms Throughput 501.878 MB/sec 2 clients 2 procs max_latency=11.061 ms Throughput 965.455 MB/sec 4 clients 4 procs max_latency=45.370 ms Throughput 1359.08 MB/sec 7 clients 7 procs max_latency=168.053 ms b) Hackbench with 4 fd 1,000,000 loops +-------+--------------------------------------+----------------------------------------+ |Number |Paravirt |Base | |of +----------+--------+---------+--------+----------+--------+---------+----------+ |Threads|Average1 |Std 
Dev1|Average2 | Std Dev|Average1 |Std Dev1|Average2 | Std Dev 2| +-------+----------+--------+---------+--------+----------+--------+---------+----------+ | 1 | 3.748 | 0.620 | 3.576 | 0.432 | 4.006 | 0.395 | 3.446 | 0.787 | +-------+----------+--------+---------+--------+----------+--------+---------+----------+ Note that this test was run just to show the interference effect over-subscription can have in baseline c) schbench results with 2 message groups on 8 vCPU VMs +-----------+-------+---------------+--------------+------------+ | | | Paravirt | Base | | +-----------+-------+-------+-------+-------+------+------------+ | |Threads| VM1 | VM2 | VM1 | VM2 |%Improvement| +-----------+-------+-------+-------+-------+------+------------+ |50.0000th | 1 | 52 | 53 | 58 | 54 | +6.25% | |75.0000th | 1 | 69 | 61 | 83 | 59 | +8.45% | |90.0000th | 1 | 80 | 80 | 89 | 83 | +6.98% | |95.0000th | 1 | 83 | 83 | 93 | 87 | +7.78% | |*99.0000th | 1 | 92 | 94 | 99 | 97 | +5.10% | |99.5000th | 1 | 95 | 100 | 102 | 103 | +4.88% | |99.9000th | 1 | 107 | 123 | 105 | 203 | +25.32% | +-----------+-------+-------+-------+-------+------+------------+ |50.0000th | 2 | 56 | 62 | 67 | 59 | +6.35% | |75.0000th | 2 | 69 | 75 | 80 | 71 | +4.64% | |90.0000th | 2 | 80 | 82 | 90 | 81 | +5.26% | |95.0000th | 2 | 85 | 87 | 97 | 91 | +8.51% | |*99.0000th | 2 | 98 | 99 | 107 | 109 | +8.79% | |99.5000th | 2 | 107 | 105 | 109 | 116 | +5.78% | |99.9000th | 2 | 9968 | 609 | 875 | 3116 | -165.02% | +-----------+-------+-------+-------+-------+------+------------+ |50.0000th | 4 | 78 | 77 | 78 | 79 | +1.27% | |75.0000th | 4 | 98 | 106 | 100 | 104 | 0.00% | |90.0000th | 4 | 987 | 1001 | 995 | 1015 | +1.09% | |95.0000th | 4 | 4136 | 5368 | 5752 | 5192 | +13.16% | |*99.0000th | 4 | 11632 | 11344 | 11024| 10736| -5.59% | |99.5000th | 4 | 12624 | 13040 | 12720| 12144| -3.22% | |99.9000th | 4 | 13168 | 18912 | 14992| 17824| +2.24% | +-----------+-------+-------+-------+-------+------+------------+ Note: 
Improvement is measured for (VM1+VM2) Signed-off-by: Rohit Jain Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dhaval.giani@oracle.com Cc: matt@codeblueprint.co.uk Cc: steven.sistare@oracle.com Cc: subhra.mazumdar@oracle.com Link: http://lkml.kernel.org/r/1525294330-7759-1-git-send-email-rohit.k.jain@oracle.com Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 495ad164dcd1731db6746346355ecf52a193c720 Author: EmanuelCN Date: Wed Apr 3 18:31:28 2024 +0000 init: Enable SCHED_THERMAL_PRESSURE by default Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 9170eccde2a4c93f9be5b4bc0bed7bce1277f8da Author: Thara Gopinath Date: Fri Feb 21 19:52:13 2020 -0500 sched/fair: Enable tuning of decay period Thermal pressure follows pelt signals which means the decay period for thermal pressure is the default pelt decay period. Depending on SoC characteristics and thermal activity, it might be beneficial to decay thermal pressure slower, but still in-tune with the pelt signals. One way to achieve this is to provide a command line parameter to set a decay shift parameter to an integer between 0 and 10. Signed-off-by: Thara Gopinath Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200222005213.3873-10-thara.gopinath@linaro.org Signed-off-by: Tashfin Shakeer Rhythm Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 0c8c7aaf09aea4f6296130605b75697fa21f2615 Author: Lukasz Luba Date: Thu Jun 10 16:03:22 2021 +0100 UPSTREAM: thermal: cpufreq_cooling: Update also offline CPUs per-cpu thermal_pressure The thermal pressure signal gives information to the scheduler about reduced CPU capacity due to thermal. It is based on a value stored in a per-cpu 'thermal_pressure' variable. The online CPUs will get the new value there, while the offline won't. 
Unfortunately, when the CPU is back online, the value read from per-cpu variable might be wrong (stale data). This might affect the scheduler decisions, since it sees the CPU capacity differently than what is actually available. Fix it by making sure that all online+offline CPUs would get the proper value in their per-cpu variable when thermal framework sets capping. Fixes: f12e4f66ab6a3 ("thermal/cpu-cooling: Update thermal pressure in case of a maximum frequency capping") Signed-off-by: Lukasz Luba Acked-by: Viresh Kumar Link: https://lore.kernel.org/all/20210614191030.22241-1-lukasz.luba@arm.com/ Bug: 199501011 Change-Id: I10cceb48b72ccce1f51cfc0a7ecfa8d8e67d4394 (cherry picked from commit 2ad8ccc17d1e4270cf65a3f2a07a7534aa23e3fb) Signed-off-by: Ram Chandrasekar Signed-off-by: Divyanshu-Modi Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit cdbf22d3d8bc1b7a384677f4af7ea20fd169c538 Author: Thara Gopinath Date: Fri Feb 21 19:52:12 2020 -0500 thermal: cpu-cooling: Update thermal pressure in case of a maximum frequency capping Thermal governors can request for a CPU's maximum supported frequency to be capped in case of an overheat event. This in turn means that the maximum capacity available for tasks to run on the particular CPU is reduced. Delta between the original maximum capacity and capped maximum capacity is known as thermal pressure. Enable cpufreq cooling device to update the thermal pressure in event of a capped maximum frequency. 
Signed-off-by: Thara Gopinath Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200222005213.3873-9-thara.gopinath@linaro.org Signed-off-by: Divyanshu-Modi Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit d81bc16f14b74721dcea680568e862361f531ca8 Author: Thara Gopinath Date: Fri Feb 21 19:52:10 2020 -0500 sched/fair: Enable periodic update of average thermal pressure Introduce support in scheduler periodic tick and other CFS bookkeeping APIs to trigger the process of computing average thermal pressure for a CPU. Also consider avg_thermal.load_avg in others_have_blocked which allows for decay of pelt signals. Signed-off-by: Thara Gopinath Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200222005213.3873-7-thara.gopinath@linaro.org Signed-off-by: Divyanshu-Modi Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 0b07e1ccac5d97bf63df30b1a9e36c99a1d59390 Author: Thara Gopinath Date: Fri Feb 21 19:52:09 2020 -0500 arm/topology: Populate arch_scale_thermal_pressure() for ARM platforms Hook up topology_get_thermal_pressure to arch_scale_thermal_pressure thus enabling scheduler to retrieve instantaneous thermal pressure of a CPU. Signed-off-by: Thara Gopinath Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200222005213.3873-6-thara.gopinath@linaro.org Signed-off-by: Divyanshu-Modi Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit f4dd9d9f8a6e9eb80615f3cf46b33a823f743e3a Author: Thara Gopinath Date: Fri Feb 21 19:52:08 2020 -0500 arm64/topology: Populate arch_scale_thermal_pressure() for arm64 platforms Hook up topology_get_thermal_pressure to arch_scale_thermal_pressure thus enabling scheduler to retrieve instantaneous thermal pressure of a CPU. 
Signed-off-by: Thara Gopinath Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200222005213.3873-5-thara.gopinath@linaro.org Signed-off-by: Divyanshu-Modi Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 8d62e23c1aff88f3ea55d7bb1f8f69295e150189 Author: Thara Gopinath Date: Fri Feb 21 19:52:07 2020 -0500 drivers/base/arch_topology: Add infrastructure to store and update instantaneous thermal pressure Add architecture specific APIs to update and track thermal pressure on a per CPU basis. A per CPU variable thermal_pressure is introduced to keep track of instantaneous per CPU thermal pressure. Thermal pressure is the delta between maximum capacity and capped capacity due to a thermal event. topology_get_thermal_pressure can be hooked into the scheduler specified arch_scale_thermal_pressure to retrieve instantaneous thermal pressure of a CPU. arch_set_thermal_pressure can be used to update the thermal pressure. Considering topology_get_thermal_pressure reads thermal_pressure and arch_set_thermal_pressure writes into thermal_pressure, one can argue for some sort of locking mechanism to avoid a stale value. But considering topology_get_thermal_pressure can be called from a system critical path like scheduler tick function, a locking mechanism is not ideal. This means that it is possible the thermal_pressure value used to calculate average thermal pressure for a CPU can be stale for up to 1 tick period. 
Signed-off-by: Thara Gopinath Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200222005213.3873-4-thara.gopinath@linaro.org Signed-off-by: Divyanshu-Modi Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 92727ae00ca5d23464f5d7ec7702bf62ab4ba026 Author: Thara Gopinath Date: Fri Feb 21 19:52:06 2020 -0500 sched/topology: Add callback to read per CPU thermal pressure Introduce the arch_scale_thermal_pressure() callback to retrieve per CPU thermal pressure. Signed-off-by: Thara Gopinath Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200222005213.3873-3-thara.gopinath@linaro.org Signed-off-by: Divyanshu-Modi Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 62abba1d3769cd0c8566f1d354035499fb6cb8a0 Author: Thara Gopinath Date: Fri Feb 21 19:52:05 2020 -0500 sched/pelt: Add support to track thermal pressure Extrapolating on the existing framework to track rt/dl utilization using pelt signals, add a similar mechanism to track thermal pressure. The difference here from rt/dl utilization tracking is that, instead of tracking time spent by a CPU running a RT/DL task through util_avg, the average thermal pressure is tracked through load_avg. This is because thermal pressure signal is weighted time "delta" capacity unlike util_avg which is binary. "delta capacity" here means delta between the actual capacity of a CPU and the decreased capacity a CPU due to a thermal event. In order to track average thermal pressure, a new sched_avg variable avg_thermal is introduced. Function update_thermal_load_avg can be called to do the periodic bookkeeping (accumulate, decay and average) of the thermal pressure. 
Reviewed-by: Vincent Guittot Signed-off-by: Thara Gopinath Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200222005213.3873-2-thara.gopinath@linaro.org Signed-off-by: Divyanshu-Modi Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 82a4f68550bde32a72afb0085c8250c7dd672d21 Author: Lingutla Chandrasekhar Date: Thu Feb 4 15:52:03 2021 +0530 sched: fair: consider all running tasks in cpu for load balance Load_balancer considers only cfs running tasks for finding busiest cpu to do load balancing. But cpu may be busy with other type tasks (ex: RT), then that cpu might not be selected as busy cpu due to weight vs nr_run check failures. And possibly cfs tasks running on that cpu would suffer till other type tasks finish or weight checks pass, while other cpus sit idle and are not able to do load balance. So, consider all running tasks to check cpu busyness. Change-Id: Iddf3f668507e20359f6388fc30ff5897d234c902 Signed-off-by: Lingutla Chandrasekhar Signed-off-by: atndko Signed-off-by: Cyber Knight commit 21028ad73e84230251c5704576d5c97eb4390e31 Author: Lucas Stach Date: Mon Aug 31 13:07:19 2020 +0200 sched/deadline: Fix stale throttling on de-/boosted tasks When a boosted task gets throttled, what normally happens is that it's immediately enqueued again with ENQUEUE_REPLENISH, which replenishes the runtime and clears the dl_throttled flag. There is a special case however: if the throttling happened on sched-out and the task has been deboosted in the meantime, the replenish is skipped as the task will return to its normal scheduling class. This leaves the task with the dl_throttled flag set. Now if the task gets boosted up to the deadline scheduling class again while it is sleeping, it's still in the throttled state. The normal wakeup however will enqueue the task with ENQUEUE_REPLENISH not set, so we don't actually place it on the rq. 
Thus we end up with a task that is runnable, but not actually on the rq and neither an immediate replenishment happens, nor is the replenishment timer set up, so the task is stuck in forever-throttled limbo. Clear the dl_throttled flag before dropping back to the normal scheduling class to fix this issue. Signed-off-by: Lucas Stach Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Link: https://lkml.kernel.org/r/20200831110719.2126930-1-l.stach@pengutronix.de Signed-off-by: Zlatan Radovanovic Signed-off-by: Cyber Knight commit 0541494a6aa9d50deb09f922e74c12b7d319e211 Author: Vincent Guittot Date: Mon Sep 21 09:24:22 2020 +0200 sched/fair: Reduce minimal imbalance threshold The 25% default imbalance threshold for DIE and NUMA domain is large enough to generate significant unfairness between threads. A typical example is the case of 11 threads running on 2x4 CPUs. The imbalance of 20% between the 2 groups of 4 cores is just low enough to not trigger the load balance between the 2 groups. We will always have the same 6 threads on one group of 4 CPUs and the other 5 threads on the other group of CPUs. With a fair time sharing in each group, we end up with +20% running time for the group of 5 threads. Consider decreasing the imbalance threshold for overloaded case where we use the load to balance task and to ensure fair time sharing. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Acked-by: Hillf Danton Link: https://lkml.kernel.org/r/20200921072424.14813-3-vincent.guittot@linaro.org Signed-off-by: Zlatan Radovanovic Signed-off-by: Cyber Knight commit ae5539ae9bd8eb9319a3a9773f1df1aa3daacbcb Author: Vincent Guittot Date: Mon Sep 21 09:24:24 2020 +0200 sched/fair: Reduce busy load balance interval The busy_factor, which increases load balance interval when a cpu is busy, is set to 32 by default. This value generates some huge LB interval on large system like the THX2 made of 2 node x 28 cores x 4 threads. 
For such system, the interval increases from 112ms to 3584ms at MC level. And from 228ms to 7168ms at NUMA level. Even on smaller system, a lower busy factor has shown improvement on the fair distribution of the running time so let reduce it for all. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Link: https://lkml.kernel.org/r/20200921072424.14813-5-vincent.guittot@linaro.org commit e644e2307fbc7b33c1baae6e8e50517b0a5924c0 Author: Valentin Schneider Date: Wed Dec 11 11:38:50 2019 +0000 BACKPORT: sched/fair: Make task_fits_capacity() consider uclamp restrictions task_fits_capacity() drives CPU selection at wakeup time, and is also used to detect misfit tasks. Right now it does so by comparing task_util_est() with a CPU's capacity, but doesn't take into account uclamp restrictions. There's a few interesting uses that can come out of doing this. For instance, a low uclamp.max value could prevent certain tasks from being flagged as misfit tasks, so they could merrily remain on low-capacity CPUs. Similarly, a high uclamp.min value would steer tasks towards high capacity CPUs at wakeup (and, should that fail, later steered via misfit balancing), so such "boosted" tasks would favor CPUs of higher capacity. Introduce uclamp_task_util() and make task_fits_capacity() use it. 
[QP: fixed missing dependency on fits_capacity() by using the open coded alternative] Bug: 120440300 Tested-By: Dietmar Eggemann Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Quentin Perret Reviewed-by: Vincent Guittot Reviewed-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191211113851.24241-5-valentin.schneider@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit a7008c07a568278ed2763436404752a98004c7ff) Signed-off-by: Quentin Perret Change-Id: Iabde2eda7252c3bcc273e61260a7a12a7de991b1 Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit fed39a8a5b86100174489179f80a7514c7a85c4b Author: Patrick Bellasi Date: Thu Sep 27 17:58:58 2018 +0100 BACKPORT: ANDROID: sched/core: Move SchedTune task API into UtilClamp wrappers The main SchedTune API calls related to task tuning attributes are now wrapped by more generic and mainlinish UtilClamp calls. The new APIs are: - uclamp_task(p) <= boosted_task_util(p) - uclamp_boosted(p) <= schedtune_task_boost(p) > 0 - uclamp_latency_sensitive(p) <= schedtune_prefer_idle(p) Let's provide also an implementation of the same API based on the new uclamp.uclamp_latency_sensitive flag. Bug: 120440300 Signed-off-by: Patrick Bellasi [Modified the patch to use uclamp.latency_sensitive instead mainline attributes] Signed-off-by: Qais Yousef Change-Id: Ib1a6902e1c07a82a370e36bf1776d895b7528cbc Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 0152bf348f6c517ba445393032bb9049c533aa27 Author: Rick Yiu Date: Mon Jun 15 22:03:08 2020 +0800 ANDROID: sched/tune: Consider stune boost margin when computing energy If CONFIG_SCHED_TUNE is enabled, it does not use boosted cpu util to compute energy, so it could not reflect the real freq when a cpu has boosted tasks on it. 
Addressing it by adding boost margin if type is FREQUENCY_UTIL in schedutil_cpu_util(). Bug: 158637636 Signed-off-by: Rick Yiu Change-Id: I570920cb1e67d07de87006fca058d50e9358b7cd Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit a07859716117b26c7794a22ec3a4dec64b618ba0 Author: Patrick Bellasi Date: Thu Apr 4 10:24:43 2019 +0100 ANDROID: sched/tune: Move SchedTune cpu API into UtilClamp wrappers The SchedTune CPU boosting API is currently used from sugov_get_util() to get the boosted utilization and to pass it into schedutil_cpu_util(). When UtilClamp is in use instead we call schedutil_cpu_util() by passing in just the CFS utilization and the clamping is done internally on the aggregated CFS+RT utilization for FREQUENCY_UTIL calls. This asymmetry is not required moreover, schedutil code is polluted by non-mainline SchedTune code. Wrap SchedTune API call related to cpu utilization boosting with a more generic and mainlinish UtilClamp call: - uclamp_rq_util_with(cpu, util, p) <= boosted_cpu_util(cpu) This new API is already used in schedutil_cpu_util() to clamp the aggregated RT+CFS utilization on FREQUENCY_UTIL calls. Move the cpu boosting into uclamp_rq_util_with() so that we remove any SchedTune specific bit from kernel/sched/cpufreq_schedutil.c. Get rid of the no more required boosted_cpu_util(cpu) method and replace it with a stune_util(cpu, util) which signature is better aligned with its uclamp_rq_util_with(cpu, util, p) counterpart. 
Bug: 120440300 Signed-off-by: Patrick Bellasi Signed-off-by: Qais Yousef Change-Id: I45b0f0f54123fe0a2515fa9f1683842e6b99234f [Removed superfluous __maybe_unused for capacity_orig_of] Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 75d6dd5a51250b259269afa979ee3fdd2f089db2 Author: Quentin Perret Date: Thu Jun 10 15:13:06 2021 +0000 ANDROID: sched/core: Make uclamp changes depend on CAP_SYS_NICE There is currently nothing preventing tasks from changing their per-task clamp values in anyway that they like. The rationale is probably that system administrators are still able to limit those clamps thanks to the cgroup interface. However, this causes pain in a system where both per-task and per-cgroup clamp values are expected to be under the control of core system components (as is the case for Android). To fix this, let's require CAP_SYS_NICE to change per-task clamp values. There are ongoing discussions upstream about more flexible approaches than this using the RLIMIT API -- see [1]. But the upstream discussion has not converged yet, and this is way too late for UAPI changes in android12-5.10 anyway, so let's apply this change which provides the behaviour we want without actually impacting UAPIs. [1] https://lore.kernel.org/lkml/20210623123441.592348-4-qperret@google.com/ Bug: 187186685 Signed-off-by: Quentin Perret Change-Id: I749312a77306460318ac5374cf243d00b78120dd Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit b6e71f141280f787c584a8c597d7898dbff28d10 Author: Quentin Perret Date: Thu Aug 5 11:21:53 2021 +0100 BACKPORT: sched/uclamp: Fix UCLAMP_FLAG_IDLE setting The UCLAMP_FLAG_IDLE flag is set on a runqueue when dequeueing the last uclamp active task (that is, when buckets.tasks reaches 0 for all buckets) to maintain the last uclamp.max and prevent blocked util from suddenly becoming visible. 
However, there is an asymmetry in how the flag is set and cleared which can lead to having the flag set whilst there are active tasks on the rq. Specifically, the flag is cleared in the uclamp_rq_inc() path, which is called at enqueue time, but set in uclamp_rq_dec_id() which is called both when dequeueing a task _and_ in the update_uclamp_active() path. As a result, when both uclamp_rq_{inc,dec}_id() are called from update_uclamp_active(), the flag ends up being set but not cleared, hence leaving the runqueue in a broken state. Fix this by clearing the flag in update_uclamp_active() as well. Fixes: e496187da710 ("sched/uclamp: Enforce last task's UCLAMP_MAX") Reported-by: Rick Yiu Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Qais Yousef Tested-by: Dietmar Eggemann Link: https://lore.kernel.org/r/20210805102154.590709-2-qperret@google.com Signed-off-by: RuRuTiaSaMa <1009087450@qq.com> Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit c0fda42ce6b4f1af0cd02cdf259aff2aaf8038cc Author: Dietmar Eggemann Date: Fri Nov 13 12:34:54 2020 +0100 UPSTREAM: sched/uclamp: Allow to reset a task uclamp constraint value In case the user wants to stop controlling a uclamp constraint value for a task, use the magic value -1 in sched_util_{min,max} with the appropriate sched_flags (SCHED_FLAG_UTIL_CLAMP_{MIN,MAX}) to indicate the reset. The advantage over the 'additional flag' approach (i.e. introducing SCHED_FLAG_UTIL_CLAMP_RESET) is that no additional flag has to be exported via uapi. This avoids the need to document how this new flag has to be used in conjunction with the existing uclamp related flags. The following subtle issue is fixed as well. When a uclamp constraint value is set on a !user_defined uclamp_se it is currently first reset and then set. Fix this by AND'ing !user_defined with !SCHED_FLAG_UTIL_CLAMP which stands for the 'sched class change' case. 
The related condition 'if (uc_se->user_defined)' moved from __setscheduler_uclamp() into uclamp_reset(). Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Yun Hsiang Link: https://lkml.kernel.org/r/20201113113454.25868-1-dietmar.eggemann@arm.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit c2295e608322cc2ed46f7bc785e9e092813a630f Author: YueHaibing Date: Tue Sep 22 21:24:10 2020 +0800 UPSTREAM: sched/core: Remove unused inline function uclamp_bucket_base_value() There is no caller in tree, so can remove it. Signed-off-by: YueHaibing Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Link: https://lkml.kernel.org/r/20200922132410.48440-1-yuehaibing@huawei.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 384e707b762fe96e3257dc45fdd06f102077e6f0 Author: Qinglang Miao Date: Sat Jul 25 16:56:29 2020 +0800 UPSTREAM: sched/uclamp: Remove unnecessary mutex_init() The uclamp_mutex lock is initialized statically via DEFINE_MUTEX(), it is unnecessary to initialize it runtime via mutex_init(). Signed-off-by: Qinglang Miao Signed-off-by: Ingo Molnar Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Dietmar Eggemann Link: https://lore.kernel.org/r/20200725085629.98292-1-miaoqinglang@huawei.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit a15a1c8215338f6381551dd7e42fe8103da1b104 Author: Qais Yousef Date: Thu Jul 16 12:03:45 2020 +0100 UPSTREAM: sched/uclamp: Add a new sysctl to control RT default boost value RT tasks by default run at the highest capacity/performance level. When uclamp is selected this default behavior is retained by enforcing the requested uclamp.min (p->uclamp_req[UCLAMP_MIN]) of the RT tasks to be uclamp_none(UCLAMP_MAX), which is SCHED_CAPACITY_SCALE; the maximum value. This is also referred to as 'the default boost value of RT tasks'. 
See commit 1a00d999971c ("sched/uclamp: Set default clamps for RT tasks"). On battery powered devices, it is desired to control this default (currently hardcoded) behavior at runtime to reduce energy consumed by RT tasks. For example, a mobile device manufacturer where big.LITTLE architecture is dominant, the performance of the little cores varies across SoCs, and on high end ones the big cores could be too power hungry. Given the diversity of SoCs, the new knob allows manufactures to tune the best performance/power for RT tasks for the particular hardware they run on. They could opt to further tune the value when the user selects a different power saving mode or when the device is actively charging. The runtime aspect of it further helps in creating a single kernel image that can be run on multiple devices that require different tuning. Keep in mind that a lot of RT tasks in the system are created by the kernel. On Android for instance I can see over 50 RT tasks, only a handful of which created by the Android framework. To control the default behavior globally by system admins and device integrator, introduce the new sysctl_sched_uclamp_util_min_rt_default to change the default boost value of the RT tasks. I anticipate this to be mostly in the form of modifying the init script of a particular device. To avoid polluting the fast path with unnecessary code, the approach taken is to synchronously do the update by traversing all the existing tasks in the system. This could race with a concurrent fork(), which is dealt with by introducing sched_post_fork() function which will ensure the racy fork will get the right update applied. Tested on Juno-r2 in combination with the RT capacity awareness [1]. By default an RT task will go to the highest capacity CPU and run at the maximum frequency, which is particularly energy inefficient on high end mobile devices because the biggest core[s] are 'huge' and power hungry. 
With this patch the RT task can be controlled to run anywhere by default, and doesn't cause the frequency to be maximum all the time. Yet any task that really needs to be boosted can easily escape this default behavior by modifying its requested uclamp.min value (p->uclamp_req[UCLAMP_MIN]) via sched_setattr() syscall. [1] 804d402fb6f6: ("sched/rt: Make RT capacity-aware") Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200716110347.19553-2-qais.yousef@arm.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 7f15a8da33343d817d828b5ca95d207e4a89495f Author: Qais Yousef Date: Thu Dec 2 11:20:33 2021 +0000 UPSTREAM: sched/uclamp: Fix rq->uclamp_max not set on first enqueue Commit d81ae8aac85c ("sched/uclamp: Fix initialization of struct uclamp_rq") introduced a bug where uclamp_max of the rq is not reset to match the woken up task's uclamp_max when the rq is idle. The code was relying on rq->uclamp_max initialized to zero, so on first enqueue static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, enum uclamp_id clamp_id) { ... if (uc_se->value > READ_ONCE(uc_rq->value)) WRITE_ONCE(uc_rq->value, uc_se->value); } was actually resetting it. But since commit d81ae8aac85c changed the default to 1024, this no longer works. And since rq->uclamp_flags is also initialized to 0, neither above code path nor uclamp_idle_reset() update the rq->uclamp_max on first wake up from idle. This is only visible from first wake up(s) until the first dequeue to idle after enabling the static key. And it only matters if the uclamp_max of this task is < 1024 since only then its uclamp_max will be effectively ignored. Fix it by properly initializing rq->uclamp_flags = UCLAMP_FLAG_IDLE to ensure uclamp_idle_reset() is called which then will update the rq uclamp_max value as expected. 
Bug: 254441685 Fixes: d81ae8aac85c ("sched/uclamp: Fix initialization of struct uclamp_rq") Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Tested-by: Dietmar Eggemann Link: https://lkml.kernel.org/r/20211202112033.1705279-1-qais.yousef@arm.com (cherry picked from commit 315c4f884800c45cb6bd8c90422fad554a8b9588) Signed-off-by: Lee Jones Change-Id: I621fc463a3e51361516c2479aff6c80213aaf918 Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 3aa7ab1653f0b9cb224c095c7f9cfa92782749f0 Author: Qais Yousef Date: Thu Jun 17 17:51:55 2021 +0100 UPSTREAM: sched/uclamp: Fix uclamp_tg_restrict() Now cpu.uclamp.min acts as a protection, we need to make sure that the uclamp request of the task is within the allowed range of the cgroup, that is it is clamp()'ed correctly by tg->uclamp[UCLAMP_MIN] and tg->uclamp[UCLAMP_MAX]. As reported by Xuewen [1] we can have some corner cases where there's inversion between uclamp requested by task (p) and the uclamp values of the taskgroup it's attached to (tg). 
Following table demonstrates 2 corner cases: | p | tg | effective -----------+-----+------+----------- CASE 1 -----------+-----+------+----------- uclamp_min | 60% | 0% | 60% -----------+-----+------+----------- uclamp_max | 80% | 50% | 50% -----------+-----+------+----------- CASE 2 -----------+-----+------+----------- uclamp_min | 0% | 30% | 30% -----------+-----+------+----------- uclamp_max | 20% | 50% | 20% -----------+-----+------+----------- With this fix we get: | p | tg | effective -----------+-----+------+----------- CASE 1 -----------+-----+------+----------- uclamp_min | 60% | 0% | 50% -----------+-----+------+----------- uclamp_max | 80% | 50% | 50% -----------+-----+------+----------- CASE 2 -----------+-----+------+----------- uclamp_min | 0% | 30% | 30% -----------+-----+------+----------- uclamp_max | 20% | 50% | 30% -----------+-----+------+----------- Additionally uclamp_update_active_tasks() must now unconditionally update both UCLAMP_MIN/MAX because changing the tg's UCLAMP_MAX for instance could have an impact on the effective UCLAMP_MIN of the tasks. 
| p | tg | effective -----------+-----+------+----------- old -----------+-----+------+----------- uclamp_min | 60% | 0% | 50% -----------+-----+------+----------- uclamp_max | 80% | 50% | 50% -----------+-----+------+----------- *new* -----------+-----+------+----------- uclamp_min | 60% | 0% | *60%* -----------+-----+------+----------- uclamp_max | 80% |*70%* | *70%* -----------+-----+------+----------- [1] https://lore.kernel.org/lkml/CAB8ipk_a6VFNjiEnHRHkUMBKbA+qzPQvhtNjJ_YNzQhqV_o8Zw@mail.gmail.com/ Bug: 254441685 Fixes: 0c18f2ecfcc2 ("sched/uclamp: Fix wrong implementation of cpu.uclamp.min") Reported-by: Xuewen Yan Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210617165155.3774110-1-qais.yousef@arm.com (cherry picked from commit 0213b7083e81f4acd69db32cb72eb4e5f220329a) Signed-off-by: Lee Jones Change-Id: I128d75fea2900ec7bc360b44f18cada76c968578 Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 02bc00f504e7174e58fbf90c176d37c91e1e759e Author: Qais Yousef Date: Thu Jul 16 12:03:47 2020 +0100 UPSTREAM: sched/uclamp: Fix a deadlock when enabling uclamp static key The following splat was caught when setting uclamp value of a task: BUG: sleeping function called from invalid context at ./include/linux/percpu-rwsem.h:49 cpus_read_lock+0x68/0x130 static_key_enable+0x1c/0x38 __sched_setscheduler+0x900/0xad8 Fix by ensuring we enable the key outside of the critical section in __sched_setscheduler() Bug: 254441685 Fixes: 46609ce22703 ("sched/uclamp: Protect uclamp fast path code with static key") Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200716110347.19553-4-qais.yousef@arm.com (cherry picked from commit e65855a52b479f98674998cb23b21ef5a8144b04) Signed-off-by: Lee Jones Change-Id: I9b33882f72b2f5a8bb8a1e077e7785f3462d1cee Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 
557b3cd9f47d60ea7d5bf8be9f6d06e34786a8ec Author: Xuewen Yan Date: Wed Jun 30 22:12:04 2021 +0800 UPSTREAM: sched/uclamp: Ignore max aggregation if rq is idle When a task wakes up on an idle rq, uclamp_rq_util_with() would max aggregate with rq value. But since there is no task enqueued yet, the values are stale based on the last task that was running. When the new task actually wakes up and enqueued, then the rq uclamp values should reflect that of the newly woken up task effective uclamp values. This is a problem particularly for uclamp_max because it default to 1024. If a task p with uclamp_max = 512 wakes up, then max aggregation would ignore the capping that should apply when this task is enqueued, which is wrong. Fix that by ignoring max aggregation if the rq is idle since in that case the effective uclamp value of the rq will be the ones of the task that will wake up. Bug: 254441685 Fixes: 9d20ad7dfc9a ("sched/uclamp: Add uclamp_util_with()") Signed-off-by: Xuewen Yan Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider [qias: Changelog] Reviewed-by: Qais Yousef Link: https://lore.kernel.org/r/20210630141204.8197-1-xuewen.yan94@gmail.com (cherry picked from commit 3e1493f46390618ea78607cb30c58fc19e2a5035) Signed-off-by: Lee Jones Change-Id: I6ea180d854d9d8ffa94abdac4800c9cb130f77cf Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 04eeff8cf3fc25bde9471441080f3900ccb09c3a Author: Qais Yousef Date: Mon May 10 15:50:32 2021 +0100 UPSTREAM: sched/uclamp: Fix locking around cpu_util_update_eff() cpu_cgroup_css_online() calls cpu_util_update_eff() without holding the uclamp_mutex or rcu_read_lock() like other call sites, which is a mistake. The uclamp_mutex is required to protect against concurrent reads and writes that could update the cgroup hierarchy. The rcu_read_lock() is required to traverse the cgroup data structures in cpu_util_update_eff(). 
Surround the caller with the required locks and add some asserts to better document the dependency in cpu_util_update_eff(). Bug: 254441685 Fixes: 7226017ad37a ("sched/uclamp: Fix a bug in propagating uclamp value in new cgroups") Reported-by: Quentin Perret Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210510145032.1934078-3-qais.yousef@arm.com (cherry picked from commit 93b73858701fd01de26a4a874eb95f9b7156fd4b) Signed-off-by: Lee Jones Change-Id: I6b11073f23f58ce4c2415cdfc46140a60e3411a2 Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit b60f7bed3b76a71ec848c5b690f6927bf36e2f0e Author: Qais Yousef Date: Mon May 10 15:50:31 2021 +0100 UPSTREAM: sched/uclamp: Fix wrong implementation of cpu.uclamp.min cpu.uclamp.min is a protection as described in cgroup-v2 Resource Distribution Model Documentation/admin-guide/cgroup-v2.rst which means we try our best to preserve the minimum performance point of tasks in this group. See full description of cpu.uclamp.min in the cgroup-v2.rst. But the current implementation makes it a limit, which is not what was intended. For example: tg->cpu.uclamp.min = 20% p0->uclamp[UCLAMP_MIN] = 0 p1->uclamp[UCLAMP_MIN] = 50% Previous Behavior (limit): p0->effective_uclamp = 0 p1->effective_uclamp = 20% New Behavior (Protection): p0->effective_uclamp = 20% p1->effective_uclamp = 50% Which is inline with how protections should work. With this change the cgroup and per-task behaviors are the same, as expected. Additionally, we remove the confusing relationship between cgroup and !user_defined flag. We don't want for example RT tasks that are boosted by default to max to change their boost value when they attach to a cgroup. If a cgroup wants to limit the max performance point of tasks attached to it, then cpu.uclamp.max must be set accordingly. 
Or if they want to set different boost value based on cgroup, then sysctl_sched_util_clamp_min_rt_default must be used to NOT boost to max and set the right cpu.uclamp.min for each group to let the RT tasks obtain the desired boost value when attached to that group. As it stands the dependency on !user_defined flag adds an extra layer of complexity that is not required now cpu.uclamp.min behaves properly as a protection. The propagation model of effective cpu.uclamp.min in child cgroups as implemented by cpu_util_update_eff() is still correct. The parent protection sets an upper limit of what the child cgroups will effectively get. Bug: 254441685 Fixes: 3eac870a3247 (sched/uclamp: Use TG's clamps to restrict TASK's clamps) Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210510145032.1934078-2-qais.yousef@arm.com (cherry picked from commit 0c18f2ecfcc274a4bcc1d122f79ebd4001c3b445) Signed-off-by: Lee Jones Change-Id: I9f9f7b9e7ef3d19ccb1685f271639c9ed76b580f Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit b2135621698d06e6a73b0f4ef15529321cc68e31 Author: Qais Yousef Date: Tue Jun 30 12:21:23 2020 +0100 BACKPORT: sched/uclamp: Protect uclamp fast path code with static key There is a report that when uclamp is enabled, a netperf UDP test regresses compared to a kernel compiled without uclamp. https://lore.kernel.org/lkml/20200529100806.GA3070@suse.de/ While investigating the root cause, there were no sign that the uclamp code is doing anything particularly expensive but could suffer from bad cache behavior under certain circumstances that are yet to be understood. https://lore.kernel.org/lkml/20200616110824.dgkkbyapn3io6wik@e107158-lin/ To reduce the pressure on the fast path anyway, add a static key that is by default will skip executing uclamp logic in the enqueue/dequeue_task() fast path until it's needed. As soon as the user start using util clamp by: 1. 
Changing uclamp value of a task with sched_setattr() 2. Modifying the default sysctl_sched_util_clamp_{min, max} 3. Modifying the default cpu.uclamp.{min, max} value in cgroup We flip the static key now that the user has opted to use util clamp. Effectively re-introducing uclamp logic in the enqueue/dequeue_task() fast path. It stays on from that point forward until the next reboot. This should help minimize the effect of util clamp on workloads that don't need it but still allow distros to ship their kernels with uclamp compiled in by default. SCHED_WARN_ON() in uclamp_rq_dec_id() was removed since now we can end up with unbalanced call to uclamp_rq_dec_id() if we flip the key while a task is running in the rq. Since we know it is harmless we just quietly return if we attempt a uclamp_rq_dec_id() when rq->uclamp[].bucket[].tasks is 0. In schedutil, we introduce a new uclamp_is_enabled() helper which takes the static key into account to ensure RT boosting behavior is retained. The following results demonstrates how this helps on 2 Sockets Xeon E5 2x10-Cores system. 
nouclamp uclamp uclamp-static-key Hmean send-64 162.43 ( 0.00%) 157.84 * -2.82%* 163.39 * 0.59%* Hmean send-128 324.71 ( 0.00%) 314.78 * -3.06%* 326.18 * 0.45%* Hmean send-256 641.55 ( 0.00%) 628.67 * -2.01%* 648.12 * 1.02%* Hmean send-1024 2525.28 ( 0.00%) 2448.26 * -3.05%* 2543.73 * 0.73%* Hmean send-2048 4836.14 ( 0.00%) 4712.08 * -2.57%* 4867.69 * 0.65%* Hmean send-3312 7540.83 ( 0.00%) 7425.45 * -1.53%* 7621.06 * 1.06%* Hmean send-4096 9124.53 ( 0.00%) 8948.82 * -1.93%* 9276.25 * 1.66%* Hmean send-8192 15589.67 ( 0.00%) 15486.35 * -0.66%* 15819.98 * 1.48%* Hmean send-16384 26386.47 ( 0.00%) 25752.25 * -2.40%* 26773.74 * 1.47%* The perf diff between nouclamp and uclamp-static-key when uclamp is disabled in the fast path: 8.73% -1.55% [kernel.kallsyms] [k] try_to_wake_up 0.07% +0.04% [kernel.kallsyms] [k] deactivate_task 0.13% -0.02% [kernel.kallsyms] [k] activate_task The diff between nouclamp and uclamp-static-key when uclamp is enabled in the fast path: 8.73% -0.72% [kernel.kallsyms] [k] try_to_wake_up 0.13% +0.39% [kernel.kallsyms] [k] activate_task 0.07% +0.38% [kernel.kallsyms] [k] deactivate_task Bug: 254441685 Fixes: 69842cba9ace ("sched/uclamp: Add CPU's clamp buckets refcounting") Reported-by: Mel Gorman Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Tested-by: Lukasz Luba Link: https://lkml.kernel.org/r/20200630112123.12076-3-qais.yousef@arm.com (cherry picked from commit 46609ce227039fd192e0ecc7d940bed587fd2c78) Signed-off-by: Lee Jones Change-Id: I80555c22b856fbbd46692f83d501f03b6f393c35 Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 322f697f698c34fd19271a1c21e756bbe0752aad Author: Qais Yousef Date: Tue Jun 30 12:21:22 2020 +0100 BACKPORT: sched/uclamp: Fix initialization of struct uclamp_rq struct uclamp_rq was zeroed out entirely in assumption that in the first call to uclamp_rq_inc() they'd be initialized correctly in accordance to default settings. 
But when next patch introduces a static key to skip uclamp_rq_{inc,dec}() until userspace opts in to use uclamp, schedutil will fail to perform any frequency changes because the rq->uclamp[UCLAMP_MAX].value is zeroed at init and stays as such. Which means all rqs are capped to 0 by default. Fix it by making sure we do proper initialization at init without relying on uclamp_rq_inc() doing it later. Bug: 254441685 Fixes: 69842cba9ace ("sched/uclamp: Add CPU's clamp buckets refcounting") Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Tested-by: Lukasz Luba Link: https://lkml.kernel.org/r/20200630112123.12076-2-qais.yousef@arm.com (cherry picked from commit d81ae8aac85ca2e307d273f6dc7863a721bf054e) Signed-off-by: Lee Jones Change-Id: I014101dbe53c85f412f87f7f6937b18d2f141800 Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 416ea5e54d1471df3352093b1aa0c3b7a4341a8f Author: Quentin Perret Date: Fri Apr 30 15:14:12 2021 +0000 UPSTREAM: sched: Fix out-of-bound access in uclamp Util-clamp places tasks in different buckets based on their clamp values for performance reasons. However, the size of buckets is currently computed using a rounding division, which can lead to an off-by-one error in some configurations. For instance, with 20 buckets, the bucket size will be 1024/20=51. A task with a clamp of 1024 will be mapped to bucket id 1024/51=20. Sadly, correct indexes are in range [0,19], hence leading to an out of bound memory access. Clamp the bucket id to fix the issue. 
Bug: 186415778 Fixes: 69842cba9ace ("sched/uclamp: Add CPU's clamp buckets refcounting") Suggested-by: Qais Yousef Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Reviewed-by: Dietmar Eggemann Link: https://lkml.kernel.org/r/20210430151412.160913-1-qperret@google.com (cherry picked from commit 6d2f8909a5fabb73fe2a63918117943986c39b6c) Signed-off-by: Quentin Perret Change-Id: I8097f5ed34abcff36c5ed395643d65727ea969eb Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 41abff075e5f46de03fcb89a58b41a73c3f137a6 Author: Quentin Perret Date: Thu Apr 16 09:59:56 2020 +0100 BACKPORT: sched/core: Fix reset-on-fork from RT with uclamp uclamp_fork() resets the uclamp values to their default when the reset-on-fork flag is set. It also checks whether the task has a RT policy, and sets its uclamp.min to 1024 accordingly. However, during reset-on-fork, the task's policy is lowered to SCHED_NORMAL right after, hence leading to an erroneous uclamp.min setting for the new task if it was forked from RT. Fix this by removing the unnecessary check on rt_task() in uclamp_fork() as this doesn't make sense if the reset-on-fork flag is set. 
[ qperret: BACKPORT because of a conflict with the Android-specific SUGOV_RT_MAX_FREQ sched_feat, which is equally unnecessary in this path ] Bug: 120440300 Fixes: 1a00d999971c ("sched/uclamp: Set default clamps for RT tasks") Reported-by: Chitti Babu Theegala Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Patrick Bellasi Reviewed-by: Dietmar Eggemann Link: https://lkml.kernel.org/r/20200416085956.217587-1-qperret@google.com (cherry picked from commit eaf5a92ebde5bca3bb2565616115bd6d579486cd) Signed-off-by: Quentin Perret Change-Id: I9a19ac5474d0508b8437e4a1d859573b4106ed08 Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit c719bd9543490474cb07d09d04cf7a01780c7a28 Author: Qais Yousef Date: Tue Jan 14 21:09:47 2020 +0000 UPSTREAM: sched/uclamp: Reject negative values in cpu_uclamp_write() The check to ensure that the new written value into cpu.uclamp.{min,max} is within range, [0:100], wasn't working because of the signed comparison 7301 if (req.percent > UCLAMP_PERCENT_SCALE) { 7302 req.ret = -ERANGE; 7303 return req; 7304 } # echo -1 > cpu.uclamp.min # cat cpu.uclamp.min 42949671.96 Cast req.percent into u64 to force the comparison to be unsigned and work as intended in capacity_from_percent(). 
# echo -1 > cpu.uclamp.min sh: write error: Numerical result out of range Bug: 120440300 Fixes: 2480c093130f ("sched/uclamp: Extend CPU's cgroup controller") Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200114210947.14083-1-qais.yousef@arm.com (cherry picked from commit b562d140649966d4daedd0483a8fe59ad3bb465a) Signed-off-by: Qais Yousef Change-Id: I17fc2b119dcbffb212e130ed2c37ae3a8d5bbb61 Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit bea7af6318b69aed941c04b64fb7aec6760692fa Author: Quentin Perret Date: Tue Jul 30 13:54:00 2019 +0100 BACKPORT: ANDROID: sched/core: Add a latency-sensitive flag to uclamp Add a 'latency_sensitive' flag to uclamp in order to express the need for some tasks to find a CPU where they can wake-up quickly. This is not expected to be used without cgroup support, so add solely a cgroup interface for it. As this flag represents a boolean attribute and not an amount of resources to be shared, it is not clear what the delegation logic should be. As such, it is kept simple: every new cgroup starts with latency_sensitive set to false, regardless of the parent. In essence, this is similar to SchedTune's prefer-idle flag which was used in android-4.19 and prior. Bug: 120440300 Change-Id: I722d8ecabb428bb7b95a5b54bc70a87f182dde2a Signed-off-by: Quentin Perret (cherry picked from commit ad7dd648fc7dbe11f23673a3463af2468a274998) Signed-off-by: Qais Yousef Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit dcf228d2be609f20fb826ec420e1af7074b3265a Author: Li Guanglei Date: Wed Dec 25 15:44:04 2019 +0800 FROMGIT: sched/core: Fix size of rq::uclamp initialization rq::uclamp is an array of struct uclamp_rq, make sure we clear the whole thing. 
Bug: 120440300 Fixes: 69842cba9ace ("sched/uclamp: Add CPU's clamp buckets refcounting") Signed-off-by: Li Guanglei Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Qais Yousef Link: https://lkml.kernel.org/r/1577259844-12677-1-git-send-email-guangleix.li@gmail.com (cherry picked from commit dcd6dffb0a75741471297724640733fa4e958d72 https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core) Signed-off-by: Qais Yousef Change-Id: Id36a2b77c45e586535e8fadfb7d66868ca8fe8c7 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 2b98fd7366175ec1903c23885f0357b51bc4c929 Author: Qais Yousef Date: Tue Dec 24 11:54:04 2019 +0000 FROMGIT: sched/uclamp: Fix a bug in propagating uclamp value in new cgroups When a new cgroup is created, the effective uclamp value wasn't updated with a call to cpu_util_update_eff() that looks at the hierarchy and updates to the most restrictive values. Fix it by ensuring to call cpu_util_update_eff() when a new cgroup becomes online. Without this change, the newly created cgroup uses the default root_task_group uclamp values, which is 1024 for both uclamp_{min, max}, which will cause the rq to be clamped to max, hence cause the system to run at max frequency. The problem was observed on Ubuntu server and was reproduced on Debian and Buildroot rootfs. By default, Ubuntu and Debian create a cpu controller cgroup hierarchy and add all tasks to it - which creates enough noise to keep the rq uclamp value at max most of the time. Imitating this behavior makes the problem visible in Buildroot too which otherwise looks fine since it's a minimal userspace. 
Bug: 120440300 Fixes: 0b60ba2dd342 ("sched/uclamp: Propagate parent clamps") Reported-by: Doug Smythies Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Tested-by: Doug Smythies Link: https://lore.kernel.org/lkml/000701d5b965$361b6c60$a2524520$@net/ (cherry picked from commit 7226017ad37a888915628e59a84a2d1e57b40707 https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core) Signed-off-by: Qais Yousef Change-Id: I9636c60e04d58bbfc5041df1305b34a12b5a3f46 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit ca1d456778b535f2b7aa6c473d4d67e4ece5a6d0 Author: Valentin Schneider Date: Wed Dec 11 11:38:49 2019 +0000 FROMGIT: sched/uclamp: Rename uclamp_util_with() into uclamp_rq_util_with() The current helper returns (CPU) rq utilization with uclamp restrictions taken into account. A uclamp task utilization helper would be quite helpful, but this requires some renaming. Prepare the code for the introduction of a uclamp_task_util() by renaming the existing uclamp_util_with() to uclamp_rq_util_with(). 
Bug: 120440300 Tested-By: Dietmar Eggemann Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Quentin Perret Reviewed-by: Vincent Guittot Reviewed-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191211113851.24241-4-valentin.schneider@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit d2b58a286e89824900d501db0be1d4f6aed474fc https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core) Signed-off-by: Qais Yousef Change-Id: I3e7146b788e079e400167203df5e5dadee2fd232 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 3b187fe5f97ec4e11f313001b70ce174b068efb8 Author: Valentin Schneider Date: Wed Dec 11 11:38:48 2019 +0000 FROMGIT: sched/uclamp: Make uclamp util helpers use and return UL values Vincent pointed out recently that the canonical type for utilization values is 'unsigned long'. Internally uclamp uses 'unsigned int' values for cache optimization, but this doesn't have to be exported to its users. Make the uclamp helpers that deal with utilization use and return unsigned long values. 
Bug: 120440300 Tested-By: Dietmar Eggemann Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Quentin Perret Reviewed-by: Vincent Guittot Reviewed-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191211113851.24241-3-valentin.schneider@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 686516b55e98edf18c2a02d36aaaa6f4c0f6c39c https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core) Signed-off-by: Qais Yousef Change-Id: Id3837f12237e5b77eb3a236bd32457dcd7de743e Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 9f7cbbc57004a73d8550358a95c803f49a4067bb Author: Valentin Schneider Date: Wed Dec 11 11:38:47 2019 +0000 FROMGIT: sched/uclamp: Remove uclamp_util() The sole user of uclamp_util(), schedutil_cpu_util(), was made to use uclamp_util_with() instead in commit: af24bde8df20 ("sched/uclamp: Add uclamp support to energy_compute()") From then on, uclamp_util() has remained unused. Being a simple wrapper around uclamp_util_with(), we can get rid of it and win back a few lines. 
Bug: 120440300 Tested-By: Dietmar Eggemann Suggested-by: Dietmar Eggemann Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191211113851.24241-2-valentin.schneider@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 59fe675248ffc37d4167e9ec6920a2f3d5ec67bb https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core) Signed-off-by: Qais Yousef Change-Id: I11dbff80c6c4be9666438800b2527aca8cd24025 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 1e2856646d5f69ab499a1c5cd9f2e3376b7a1ef9 Author: Qais Yousef Date: Wed Oct 9 11:46:11 2019 +0100 BACKPORT: sched/rt: Make RT capacity-aware Capacity Awareness refers to the fact that on heterogeneous systems (like Arm big.LITTLE), the capacity of the CPUs is not uniform, hence when placing tasks we need to be aware of this difference of CPU capacities. In such scenarios we want to ensure that the selected CPU has enough capacity to meet the requirement of the running task. Enough capacity means here that capacity_orig_of(cpu) >= task.requirement. The definition of task.requirement is dependent on the scheduling class. For CFS, utilization is used to select a CPU that has >= capacity value than the cfs_task.util. capacity_orig_of(cpu) >= cfs_task.util DL isn't capacity aware at the moment but can make use of the bandwidth reservation to implement that in a similar manner CFS uses utilization. The following patchset implements that: https://lore.kernel.org/lkml/20190506044836.2914-1-luca.abeni@santannapisa.it/ capacity_orig_of(cpu)/SCHED_CAPACITY >= dl_deadline/dl_runtime For RT we don't have a per task utilization signal and we lack any information in general about what performance requirement the RT task needs. 
But with the introduction of uclamp, RT tasks can now control that by setting uclamp_min to guarantee a minimum performance point. ATM the uclamp value are only used for frequency selection; but on heterogeneous systems this is not enough and we need to ensure that the capacity of the CPU is >= uclamp_min. Which is what implemented here. capacity_orig_of(cpu) >= rt_task.uclamp_min Note that by default uclamp.min is 1024, which means that RT tasks will always be biased towards the big CPUs, which make for a better more predictable behavior for the default case. Must stress that the bias acts as a hint rather than a definite placement strategy. For example, if all big cores are busy executing other RT tasks we can't guarantee that a new RT task will be placed there. On non-heterogeneous systems the original behavior of RT should be retained. Similarly if uclamp is not selected in the config. [ mingo: Minor edits to comments. ] Bug: 120440300 Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Reviewed-by: Steven Rostedt (VMware) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191009104611.15363-1-qais.yousef@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 804d402fb6f6487b825aae8cf42fda6426c62867 https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git) [Qais: resolved minor conflict in kernel/sched/cpupri.c] Signed-off-by: Qais Yousef Change-Id: Ifc9da1c47de1aec9b4d87be2614e4c8968366900 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit fbcf2f9dcec79826507ced228a9278ae404c2a8a Author: Valentin Schneider Date: Fri Nov 15 10:39:08 2019 +0000 UPSTREAM: sched/uclamp: Fix overzealous type replacement Some uclamp helpers had their return type changed from 'unsigned int' to 'enum uclamp_id' by commit 0413d7f33e60 ("sched/uclamp: Always use 'enum uclamp_id' for clamp_id values") but it happens that some do 
return a value in the [0, SCHED_CAPACITY_SCALE] range, which should really be unsigned int. The affected helpers are uclamp_none(), uclamp_rq_max_value() and uclamp_eff_value(). Fix those up. Note that this doesn't lead to any obj diff using a relatively recent aarch64 compiler (8.3-2019.03). The current code of e.g. uclamp_eff_value() properly returns an 11 bit value (bits_per(1024)) and doesn't seem to do anything funny. I'm still marking this as fixing the above commit to be on the safe side. Bug: 120440300 Signed-off-by: Valentin Schneider Reviewed-by: Qais Yousef Acked-by: Vincent Guittot Cc: Dietmar.Eggemann@arm.com Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: patrick.bellasi@matbug.net Cc: qperret@google.com Cc: surenb@google.com Cc: tj@kernel.org Fixes: 0413d7f33e60 ("sched/uclamp: Always use 'enum uclamp_id' for clamp_id values") Link: https://lkml.kernel.org/r/20191115103908.27610-1-valentin.schneider@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 7763baace1b738d65efa46d68326c9406311c6bf) Signed-off-by: Qais Yousef Change-Id: I924a99c125372a8fca81cb4bc0c82e6a7183fc8a Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit b52d3d696bf2b1aecece38539b2fcf4f7f8445f5 Author: Qais Yousef Date: Thu Nov 14 21:10:52 2019 +0000 UPSTREAM: sched/uclamp: Fix incorrect condition uclamp_update_active() should perform the update when p->uclamp[clamp_id].active is true. But when the logic was inverted in [1], the if condition wasn't inverted correctly too. 
[1] https://lore.kernel.org/lkml/20190902073836.GO2369@hirez.programming.kicks-ass.net/ Bug: 120440300 Reported-by: Suren Baghdasaryan Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Acked-by: Vincent Guittot Cc: Ben Segall Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Fixes: babbe170e053 ("sched/uclamp: Update CPU's refcount on TG's clamp changes") Link: https://lkml.kernel.org/r/20191114211052.15116-1-qais.yousef@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 6e1ff0773f49c7d38e8b4a9df598def6afb9f415) Signed-off-by: Qais Yousef Change-Id: I51b58a6089290277e08a0aaa72b86f852eec1512 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit bbe475a9c1f66f4da03c5870aff5883fed09eb77 Author: Qais Yousef Date: Tue Nov 5 11:22:12 2019 +0000 UPSTREAM: sched/core: Fix compilation error when cgroup not selected When cgroup is disabled the following compilation error was hit kernel/sched/core.c: In function ‘uclamp_update_active_tasks’: kernel/sched/core.c:1081:23: error: storage size of ‘it’ isn’t known struct css_task_iter it; ^~ kernel/sched/core.c:1084:2: error: implicit declaration of function ‘css_task_iter_start’; did you mean ‘__sg_page_iter_start’? [-Werror=implicit-function-declaration] css_task_iter_start(css, 0, &it); ^~~~~~~~~~~~~~~~~~~ __sg_page_iter_start kernel/sched/core.c:1085:14: error: implicit declaration of function ‘css_task_iter_next’; did you mean ‘__sg_page_iter_next’? [-Werror=implicit-function-declaration] while ((p = css_task_iter_next(&it))) { ^~~~~~~~~~~~~~~~~~ __sg_page_iter_next kernel/sched/core.c:1091:2: error: implicit declaration of function ‘css_task_iter_end’; did you mean ‘get_task_cred’? 
[-Werror=implicit-function-declaration] css_task_iter_end(&it); ^~~~~~~~~~~~~~~~~ get_task_cred kernel/sched/core.c:1081:23: warning: unused variable ‘it’ [-Wunused-variable] struct css_task_iter it; ^~ cc1: some warnings being treated as errors make[2]: *** [kernel/sched/core.o] Error 1 Fix by protecting uclamp_update_active_tasks() with CONFIG_UCLAMP_TASK_GROUP Bug: 120440300 Fixes: babbe170e053 ("sched/uclamp: Update CPU's refcount on TG's clamp changes") Reported-by: Randy Dunlap Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Tested-by: Randy Dunlap Cc: Steven Rostedt Cc: Ingo Molnar Cc: Vincent Guittot Cc: Patrick Bellasi Cc: Mel Gorman Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Ben Segall Link: https://lkml.kernel.org/r/20191105112212.596-1-qais.yousef@arm.com (cherry picked from commit e3b8b6a0d12cccf772113d6b5c1875192186fbd4) Signed-off-by: Qais Yousef Change-Id: Ia4c0f801d68050526f9f117ec9189e448b01345a Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 35f8e35625c692abfc81b698e5112058d37dacad Author: Ingo Molnar Date: Wed Sep 4 09:55:32 2019 +0200 UPSTREAM: sched/core: Fix uclamp ABI bug, clean up and robustify sched_read_attr() ABI logic and code Thadeu Lima de Souza Cascardo reported that 'chrt' broke on recent kernels: $ chrt -p $$ chrt: failed to get pid 26306's policy: Argument list too long and he has root-caused the bug to the following commit increasing sched_attr size and breaking sched_read_attr() into returning -EFBIG: a509a7cd7974 ("sched/uclamp: Extend sched_setattr() to support utilization clamping") The other, bigger bug is that the whole sched_getattr() and sched_read_attr() logic of checking non-zero bits in new ABI components is arguably broken, and pretty much any extension of the ABI will spuriously break the ABI. That's way too fragile. 
Instead implement the perf syscall's extensible ABI instead, which we already implement on the sched_setattr() side: - if user-attributes have the same size as kernel attributes then the logic is unchanged. - if user-attributes are larger than the kernel knows about then simply skip the extra bits, but set attr->size to the (smaller) kernel size so that tooling can (in principle) handle older kernel as well. - if user-attributes are smaller than the kernel knows about then just copy whatever user-space can accept. Also clean up the whole logic: - Simplify the code flow - there's no need for 'ret' for example. - Standardize on 'kattr/uattr' and 'ksize/usize' naming to make sure we always know which side we are dealing with. - Why is it called 'read' when what it does is to copy to user? This code is so far away from VFS read() semantics that the naming is actively confusing. Name it sched_attr_copy_to_user() instead, which mirrors other copy_to_user() functionality. - Move the attr->size assignment from the head of sched_getattr() to the sched_attr_copy_to_user() function. Nothing else within the kernel should care about the size of the structure. With these fixes the sched_getattr() syscall now nicely supports an extensible ABI in both a forward and backward compatible fashion, and will also fix the chrt bug. As an added bonus the bogus -EFBIG return is removed as well, which as Thadeu noted should have been -E2BIG to begin with. 
Bug: 120440300 Reported-by: Thadeu Lima de Souza Cascardo Tested-by: Dietmar Eggemann Tested-by: Thadeu Lima de Souza Cascardo Acked-by: Thadeu Lima de Souza Cascardo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: a509a7cd7974 ("sched/uclamp: Extend sched_setattr() to support utilization clamping") Link: https://lkml.kernel.org/r/20190904075532.GA26751@gmail.com Signed-off-by: Ingo Molnar (cherry picked from commit 1251201c0d34fadf69d56efa675c2b7dd0a90eca) Signed-off-by: Qais Yousef Change-Id: I67e653c4f69db0140e9651c125b60e2b8cfd62f1 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 2e5fab725ddfc0db94558a0eb68fbb7584ab1a93 Author: Patrick Bellasi Date: Thu Aug 22 14:28:11 2019 +0100 UPSTREAM: sched/uclamp: Always use 'enum uclamp_id' for clamp_id values The supported clamp indexes are defined in 'enum clamp_id', however, because of the code logic in some of the first utilization clamping series version, sometimes we needed to use 'unsigned int' to represent indices. This is not more required since the final version of the uclamp_* APIs can always use the proper enum uclamp_id type. Fix it with a bulk rename now that we have all the bits merged. Bug: 120440300 Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Michal Koutny Acked-by: Tejun Heo Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . 
Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190822132811.31294-7-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 0413d7f33e60751570fd6c179546bde2f7d82dcb) Signed-off-by: Qais Yousef Change-Id: I0be680b2489fa07244bac63b5c6fe1a79a53bef7 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 27326e00f48ecb7241256b777090282295061a10 Author: Patrick Bellasi Date: Thu Aug 22 14:28:10 2019 +0100 UPSTREAM: sched/uclamp: Update CPU's refcount on TG's clamp changes On updates of task group (TG) clamp values, ensure that these new values are enforced on all RUNNABLE tasks of the task group, i.e. all RUNNABLE tasks are immediately boosted and/or capped as requested. Do that each time we update effective clamps from cpu_util_update_eff(). Use the *cgroup_subsys_state (css) to walk the list of tasks in each affected TG and update their RUNNABLE tasks. Update each task by using the same mechanism used for cpu affinity masks updates, i.e. by taking the rq lock. Bug: 120440300 Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Michal Koutny Acked-by: Tejun Heo Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . 
Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190822132811.31294-6-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit babbe170e053c6ec2343751749995b7b9fd5fd2c) Signed-off-by: Qais Yousef Change-Id: I5e48891bd48c266dd282e1bab8f60533e4e29b48 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit e66ba8289ce3a479f0c6c52d25307c4d2b22f256 Author: Patrick Bellasi Date: Thu Aug 22 14:28:09 2019 +0100 UPSTREAM: sched/uclamp: Use TG's clamps to restrict TASK's clamps When a task specific clamp value is configured via sched_setattr(2), this value is accounted in the corresponding clamp bucket every time the task is {en,de}queued. However, when cgroups are also in use, the task specific clamp values could be restricted by the task_group (TG) clamp values. Update uclamp_cpu_inc() to aggregate task and TG clamp values. Every time a task is enqueued, it's accounted in the clamp bucket tracking the smaller clamp between the task specific value and its TG effective value. This allows to: 1. ensure cgroup clamps are always used to restrict task specific requests, i.e. boosted not more than its TG effective protection and capped at least as its TG effective limit. 2. implement a "nice-like" policy, where tasks are still allowed to request less than what is enforced by their TG effective limits and protections Do this by exploiting the concept of "effective" clamp, which is already used by a TG to track parent enforced restrictions. Apply task group clamp restrictions only to tasks belonging to a child group. While, for tasks in the root group or in an autogroup, system defaults are still enforced. 
Bug: 120440300 Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Michal Koutny Acked-by: Tejun Heo Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190822132811.31294-5-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 3eac870a324728e5d17118888840dad70bcd37f3) Signed-off-by: Qais Yousef Change-Id: I0215e0a68cc0fa7c441e33052757f8571b7c99b9 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit d26df4b554e45013689e091881875e767cc55122 Author: Patrick Bellasi Date: Thu Aug 22 14:28:08 2019 +0100 UPSTREAM: sched/uclamp: Propagate system defaults to the root group The clamp values are not tunable at the level of the root task group. That's for two main reasons: - the root group represents "system resources" which are always entirely available from the cgroup standpoint. - when tuning/restricting "system resources" makes sense, tuning must be done using a system wide API which should also be available when control groups are not. When a system wide restriction is available, cgroups should be aware of its value in order to know exactly how much "system resources" are available for the subgroups. Utilization clamping supports already the concepts of: - system defaults: which define the maximum possible clamp values usable by tasks. - effective clamps: which allows a parent cgroup to constraint (maybe temporarily) its descendants without losing the information related to the values "requested" from them. 
Exploit these two concepts and bind them together in such a way that, whenever system default are tuned, the new values are propagated to (possibly) restrict or relax the "effective" value of nested cgroups. When cgroups are in use, force an update of all the RUNNABLE tasks. Otherwise, keep things simple and do just a lazy update next time each task will be enqueued. Do that since we assume a more strict resource control is required when cgroups are in use. This allows also to keep "effective" clamp values updated in case we need to expose them to user-space. Bug: 120440300 Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Michal Koutny Acked-by: Tejun Heo Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190822132811.31294-4-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 7274a5c1bbec45f06f1fff4b8c8b5855b6cc189d) Signed-off-by: Qais Yousef Change-Id: Ibf7ce5c46b67c79765b56b792ee22ed9595802c3 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 46b50a614c89320b23ab7615fb3cd92083c38bcf Author: Patrick Bellasi Date: Thu Aug 22 14:28:07 2019 +0100 UPSTREAM: sched/uclamp: Propagate parent clamps In order to properly support hierarchical resources control, the cgroup delegation model requires that attribute writes from a child group never fail but still are locally consistent and constrained based on parent's assigned resources. This requires to properly propagate and aggregate parent attributes down to its descendants. Implement this mechanism by adding a new "effective" clamp value for each task group. 
The effective clamp value is defined as the smaller value between the clamp value of a group and the effective clamp value of its parent. This is the actual clamp value enforced on tasks in a task group. Since it's possible for a cpu.uclamp.min value to be bigger than the cpu.uclamp.max value, ensure local consistency by restricting each "protection" (i.e. min utilization) with the corresponding "limit" (i.e. max utilization). Do that at effective clamps propagation to ensure all user-space write never fails while still always tracking the most restrictive values. Bug: 120440300 Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Michal Koutny Acked-by: Tejun Heo Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190822132811.31294-3-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 0b60ba2dd342016e4e717dbaa4ca9af3a43f4434) Signed-off-by: Qais Yousef Change-Id: If1cc136e1fb4a8f4c6ea15dc440b28d833a8d7e7 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 067c0b19313dc9e3730abf9ceb5942420a2c3fdd Author: Patrick Bellasi Date: Thu Aug 22 14:28:06 2019 +0100 BACKPORT: sched/uclamp: Extend CPU's cgroup controller The cgroup CPU bandwidth controller allows to assign a specified (maximum) bandwidth to the tasks of a group. However this bandwidth is defined and enforced only on a temporal base, without considering the actual frequency a CPU is running on. Thus, the amount of computation completed by a task within an allocated bandwidth can be very different depending on the actual frequency the CPU is running that task. 
The amount of computation can be affected also by the specific CPU a task is running on, especially when running on asymmetric capacity systems like Arm's big.LITTLE. With the availability of schedutil, the scheduler is now able to drive frequency selections based on actual task utilization. Moreover, the utilization clamping support provides a mechanism to bias the frequency selection operated by schedutil depending on constraints assigned to the tasks currently RUNNABLE on a CPU. Giving the mechanisms described above, it is now possible to extend the cpu controller to specify the minimum (or maximum) utilization which should be considered for tasks RUNNABLE on a cpu. This makes it possible to better defined the actual computational power assigned to task groups, thus improving the cgroup CPU bandwidth controller which is currently based just on time constraints. Extend the CPU controller with a couple of new attributes uclamp.{min,max} which allow to enforce utilization boosting and capping for all the tasks in a group. Specifically: - uclamp.min: defines the minimum utilization which should be considered i.e. the RUNNABLE tasks of this group will run at least at a minimum frequency which corresponds to the uclamp.min utilization - uclamp.max: defines the maximum utilization which should be considered i.e. the RUNNABLE tasks of this group will run up to a maximum frequency which corresponds to the uclamp.max utilization These attributes: a) are available only for non-root nodes, both on default and legacy hierarchies, while system wide clamps are defined by a generic interface which does not depends on cgroups. This system wide interface enforces constraints on tasks in the root node. b) enforce effective constraints at each level of the hierarchy which are a restriction of the group requests considering its parent's effective constraints. Root group effective constraints are defined by the system wide interface. 
This mechanism allows each (non-root) level of the hierarchy to: - request whatever clamp values it would like to get - effectively get only up to the maximum amount allowed by its parent c) have higher priority than task-specific clamps, defined via sched_setattr(), thus allowing to control and restrict task requests. Add two new attributes to the cpu controller to collect "requested" clamp values. Allow that at each non-root level of the hierarchy. Keep it simple by not caring now about "effective" values computation and propagation along the hierarchy. Update sysctl_sched_uclamp_handler() to use the newly introduced uclamp_mutex so that we serialize system default updates with cgroup relate updates. Bug: 120440300 Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Michal Koutny Acked-by: Tejun Heo Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190822132811.31294-2-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 2480c093130f64ac3a410504fa8b3db1fc4b87ce) Signed-off-by: Qais Yousef Change-Id: I0285c44910bf073b80d7996361e6698bc5aedfae Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 64b042aac4283cf4456f213f3546daee0c731785 Author: Patrick Bellasi Date: Fri Jun 21 09:42:11 2019 +0100 UPSTREAM: sched/uclamp: Add uclamp_util_with() So far uclamp_util() allows to clamp a specified utilization considering the clamp values requested by RUNNABLE tasks in a CPU. For the Energy Aware Scheduler (EAS) it is interesting to test how clamp values will change when a task is becoming RUNNABLE on a given CPU. 
For example, EAS is interested in comparing the energy impact of different scheduling decisions and the clamp values can play a role on that. Add uclamp_util_with() which allows to clamp a given utilization by considering the possible impact on CPU clamp values of a specified task. Bug: 120440300 Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-11-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 9d20ad7dfc9a5cc64e33d725902d3863d350a66a) Signed-off-by: Qais Yousef Change-Id: Ida153a3526b87f5674a6e037d4725d99eec7b478 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 0fc858cc7a9712c08da29adc187e69279768a6e4 Author: Patrick Bellasi Date: Fri Jun 21 09:42:10 2019 +0100 BACKPORT: sched/cpufreq, sched/uclamp: Add clamps for FAIR and RT tasks Each time a frequency update is required via schedutil, a frequency is selected to (possibly) satisfy the utilization reported by each scheduling class and irqs. However, when utilization clamping is in use, the frequency selection should consider userspace utilization clamping hints. This will allow, for example, to: - boost tasks which are directly affecting the user experience by running them at least at a minimum "requested" frequency - cap low priority tasks not directly affecting the user experience by running them only up to a maximum "allowed" frequency These constraints are meant to support a per-task based tuning of the frequency selection thus supporting a fine grained definition of performance boosting vs energy saving strategies in kernel space. 
Add support to clamp the utilization of RUNNABLE FAIR and RT tasks within the boundaries defined by their aggregated utilization clamp constraints. Do that by considering the max(min_util, max_util) to give boosted tasks the performance they need even when they happen to be co-scheduled with other capped tasks. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-10-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 982d9cdc22c9f6df5ad790caa229ff74fb1d95e7) Conflicts: kernel/sched/cpufreq_schedutil.c 1. Merged the if condition to include the non-upstream sched_feat(SUGOV_RT_MAX_FREQ) check 2. Change the function signature to pass util_cfs and define util as an automatic variable. Bug: 120440300 Signed-off-by: Qais Yousef Change-Id: Ie222c9ad84776fc2948e30c116eee876df697a17 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit c927abaedd42cf29f8e3dc92613e84c4375e4773 Author: Patrick Bellasi Date: Fri Jun 21 09:42:09 2019 +0100 UPSTREAM: sched/uclamp: Set default clamps for RT tasks By default FAIR tasks start without clamps, i.e. neither boosted nor capped, and they run at the best frequency matching their utilization demand. This default behavior does not fit RT tasks which instead are expected to run at the maximum available frequency, if not otherwise required by explicitly capping them. Enforce the correct behavior for RT tasks by setting util_min to max whenever: 1. the task is switched to the RT class and it does not already have a user-defined clamp value assigned. 2. 
an RT task is forked from a parent with RESET_ON_FORK set. NOTE: utilization clamp values are cross scheduling class attributes and thus they are never changed/reset once a value has been explicitly defined from user-space. Bug: 120440300 Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-9-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 1a00d999971c78ab024a17b0efc37d78404dd120) Signed-off-by: Qais Yousef Change-Id: I81fcadaea34f557e531fa5ac6aab84fcb0ee37c7 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit bd6028f582359eac294fe7363206330696c84b9c Author: Patrick Bellasi Date: Fri Jun 21 09:42:08 2019 +0100 UPSTREAM: sched/uclamp: Reset uclamp values on RESET_ON_FORK A forked task gets the same clamp values of its parent however, when the RESET_ON_FORK flag is set on parent, e.g. via: sys_sched_setattr() sched_setattr() __sched_setscheduler(attr::SCHED_FLAG_RESET_ON_FORK) the new forked task is expected to start with all attributes reset to default values. Do that for utilization clamp values too by checking the reset request from the existing uclamp_fork() call which already provides the required initialization for other uclamp related bits. Bug: 120440300 Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . 
Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-8-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit a87498ace58e23b62a572dc7267579ede4c8495c) Signed-off-by: Qais Yousef Change-Id: If7bda202707aac3a2696a42f8146f607cdd36905 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit c713ddf1b1d0c7fac555eacf8579b8ffd5bdd96d Author: Patrick Bellasi Date: Fri Jun 21 09:42:07 2019 +0100 BACKPORT: sched/uclamp: Extend sched_setattr() to support utilization clamping The SCHED_DEADLINE scheduling class provides an advanced and formal model to define tasks requirements that can translate into proper decisions for both task placements and frequencies selections. Other classes have a more simplified model based on the POSIX concept of priorities. Such a simple priority based model however does not allow to exploit most advanced features of the Linux scheduler like, for example, driving frequencies selection via the schedutil cpufreq governor. However, also for non SCHED_DEADLINE tasks, it's still interesting to define tasks properties to support scheduler decisions. Utilization clamping exposes to user-space a new set of per-task attributes the scheduler can use as hints about the expected/required utilization for a task. This allows to implement a "proactive" per-task frequency control policy, a more advanced policy than the current one based just on "passive" measured task utilization. For example, it's possible to boost interactive tasks (e.g. to get better performance) or cap background tasks (e.g. to be more energy/thermal efficient). Introduce a new API to set utilization clamping values for a specified task by extending sched_setattr(), a syscall which already allows to define task specific properties for different scheduling classes. 
A new pair of attributes allows to specify a minimum and maximum utilization the scheduler can consider for a task. Do that by validating the required clamp values before and then applying the required changes using _the_ same pattern already in use for __setscheduler(). This ensures that the task is re-enqueued with the new clamp values. Bug: 120440300 Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-7-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit a509a7cd79747074a2c018a45bbbc52d1f4aed44) Signed-off-by: Qais Yousef Change-Id: I420e7ece5628bc639811a79654c35135a65bfd02 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 6306516692f50c3a2d2119142e393f5ef9f48259 Author: Patrick Bellasi Date: Fri Jun 21 09:42:06 2019 +0100 BACKPORT: sched/core: Allow sched_setattr() to use the current policy The sched_setattr() syscall mandates that a policy is always specified. This requires to always know which policy a task will have when attributes are configured and this makes it impossible to add more generic task attributes valid across different scheduling policies. Reading the policy before setting generic tasks attributes is racy since we cannot be sure it is not changed concurrently. Introduce the required support to change generic task attributes without affecting the current task policy. This is done by adding an attribute flag (SCHED_FLAG_KEEP_POLICY) to enforce the usage of the current policy. 
Add support for the SETPARAM_POLICY policy, which is already used by the sched_setparam() POSIX syscall, to the sched_setattr() non-POSIX syscall. Bug: 120440300 Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-6-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 1d6362fa0cfc8c7b243fa92924429d826599e691) Signed-off-by: Qais Yousef Change-Id: I41cbe73d7aa30123adbd757fa30e346938651784 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 62d50c5a8556377176c6e3ef1226aa968e714730 Author: Patrick Bellasi Date: Fri Jun 21 09:42:05 2019 +0100 UPSTREAM: sched/uclamp: Add system default clamps Tasks without a user-defined clamp value are considered not clamped and by default their utilization can have any value in the [0..SCHED_CAPACITY_SCALE] range. Tasks with a user-defined clamp value are allowed to request any value in that range, and the required clamp is unconditionally enforced. However, a "System Management Software" could be interested in limiting the range of clamp values allowed for all tasks. Add a privileged interface to define a system default configuration via: /proc/sys/kernel/sched_uclamp_util_{min,max} which works as an unconditional clamp range restriction for all tasks. With the default configuration, the full SCHED_CAPACITY_SCALE range of values is allowed for each clamp index. Otherwise, the task-specific clamp is capped by the corresponding system default value. 
Do that by tracking, for each task, the "effective" clamp value and bucket the task has been refcounted in at enqueue time. This allows to lazy aggregate "requested" and "system default" values at enqueue time and simplifies refcounting updates at dequeue time. The cached bucket ids are used to avoid (relatively) more expensive integer divisions every time a task is enqueued. An active flag is used to report when the "effective" value is valid and thus the task is actually refcounted in the corresponding rq's bucket. Bug: 120440300 Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-5-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit e8f14172c6b11e9a86c65532497087f8eb0f91b1) Signed-off-by: Qais Yousef Change-Id: I4f014c5ec9c312aaad606518f6e205fd0cfbcaa2 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 8bd52ebd588ad2c724fb789ee11b6e3179145eba Author: Patrick Bellasi Date: Fri Jun 21 09:42:04 2019 +0100 UPSTREAM: sched/uclamp: Enforce last task's UCLAMP_MAX When a task sleeps it removes its max utilization clamp from its CPU. However, the blocked utilization on that CPU can be higher than the max clamp value enforced while the task was running. This allows undesired CPU frequency increases while a CPU is idle, for example, when another CPU on the same frequency domain triggers a frequency update, since schedutil can now see the full not clamped blocked utilization of the idle CPU. 
Fix this by using: uclamp_rq_dec_id(p, rq, UCLAMP_MAX) uclamp_rq_max_value(rq, UCLAMP_MAX, clamp_value) to detect when a CPU has no more RUNNABLE clamped tasks and to flag this condition. Don't track any minimum utilization clamps since an idle CPU never requires a minimum frequency. The decay of the blocked utilization is good enough to reduce the CPU frequency. Bug: 120440300 Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-4-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit e496187da71070687b55ff455e7d8d7d7f0ae0b9) Signed-off-by: Qais Yousef Change-Id: Ie9eab897eb654ec9d4fba5eda20f66a91a712817 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit d3048c160f56d4003be0a9867b975140ea5efc0e Author: Patrick Bellasi Date: Fri Jun 21 09:42:03 2019 +0100 UPSTREAM: sched/uclamp: Add bucket local max tracking Because of bucketization, different task-specific clamp values are tracked in the same bucket. For example, with 20% bucket size and assuming to have: Task1: util_min=25% Task2: util_min=35% both tasks will be refcounted in the [20..39]% bucket and always boosted only up to 20% thus implementing a simple floor aggregation normally used in histograms. In systems with only few and well-defined clamp values, it would be useful to track the exact clamp value required by a task whenever possible. For example, if a system requires only 23% and 47% boost values then it's possible to track the exact boost required by each task using only 3 buckets of ~33% size each. 
Introduce a mechanism to max aggregate the requested clamp values of RUNNABLE tasks in the same bucket. Keep it simple by resetting the bucket value to its base value only when a bucket becomes inactive. Allow a limited and controlled overboosting margin for tasks recounted in the same bucket. In systems where the boost values are not known in advance, it is still possible to control the maximum acceptable overboosting margin by tuning the number of clamp groups. For example, 20 groups ensure a 5% maximum overboost. Remove the rq bucket initialization code since a correct bucket value is now computed when a task is refcounted into a CPU's rq. Bug: 120440300 Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-3-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 60daf9c19410604f08c99e146bc378c8a64f4ccd) Signed-off-by: Qais Yousef Change-Id: I8782971f8867033cee5aaf981c96f9de33a5288c Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 016fa3c1ea0a4a5bed08ff564ae45903ce3dfb9b Author: Patrick Bellasi Date: Fri Jun 21 09:42:02 2019 +0100 BACKPORT: sched/uclamp: Add CPU's clamp buckets refcounting Utilization clamping allows to clamp the CPU's utilization within a [util_min, util_max] range, depending on the set of RUNNABLE tasks on that CPU. Each task references two "clamp buckets" defining its minimum and maximum (util_{min,max}) utilization "clamp values". A CPU's clamp bucket is active if there is at least one RUNNABLE tasks enqueued on that CPU and refcounting that bucket. 
When a task is {en,de}queued {on,from} a rq, the set of active clamp buckets on that CPU can change. If the set of active clamp buckets changes for a CPU a new "aggregated" clamp value is computed for that CPU. This is because each clamp bucket enforces a different utilization clamp value. Clamp values are always MAX aggregated for both util_min and util_max. This ensures that no task can affect the performance of other co-scheduled tasks which are more boosted (i.e. with higher util_min clamp) or less capped (i.e. with higher util_max clamp). A task has: task_struct::uclamp[clamp_id]::bucket_id to track the "bucket index" of the CPU's clamp bucket it refcounts while enqueued, for each clamp index (clamp_id). A runqueue has: rq::uclamp[clamp_id]::bucket[bucket_id].tasks to track how many RUNNABLE tasks on that CPU refcount each clamp bucket (bucket_id) of a clamp index (clamp_id). It also has a: rq::uclamp[clamp_id]::bucket[bucket_id].value to track the clamp value of each clamp bucket (bucket_id) of a clamp index (clamp_id). The rq::uclamp::bucket[clamp_id][] array is scanned every time it's needed to find a new MAX aggregated clamp value for a clamp_id. This operation is required only when it's dequeued the last task of a clamp bucket tracking the current MAX aggregated clamp value. In this case, the CPU is either entering IDLE or going to schedule a less boosted or more clamped task. The expected number of different clamp values configured at build time is small enough to fit the full unordered array into a single cache line, for configurations of up to 7 buckets. Add to struct rq the basic data structures required to refcount the number of RUNNABLE tasks for each clamp bucket. Add also the max aggregation required to update the rq's clamp value at each enqueue/dequeue event. Use a simple linear mapping of clamp values into clamp buckets. Pre-compute and cache bucket_id to avoid integer divisions at enqueue/dequeue time. 
Bug: 120440300 Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-2-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 69842cba9ace84849bb9b8edcdf2cefccd97901c) Signed-off-by: Qais Yousef Change-Id: I2c2c23572fb82e004f815cc9c783881355df6836 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit ff72a25515d4673ccdfe6a7a8606326725ee057f Author: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> Date: Wed May 22 15:56:14 2024 +0800 cpufreq: schedutil: Checkout to msm-4.19 * From: https://github.com/EmanuelCN/kernel_xiaomi_sm8250/blob/staging/kernel/sched/cpufreq_schedutil.c * Removed sugov_get_util() under CONFIG_SCHED_WALT guard, because we don't use WALT anymore * Preserved 819f63f and c2bdfaf, also introduce new definitions from 4.19 for new schedutil Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit eecd40ea2f0a9b22cd79f288da288e1d4513c086 Author: Viresh Kumar Date: Tue May 22 15:31:30 2018 +0530 cpufreq: Rename cpufreq_can_do_remote_dvfs() This routine checks if the CPU running this code belongs to the policy of the target CPU or if not, can it do remote DVFS for it remotely. But the current name of it implies as if it is only about doing remote updates. Rename it to make it more relevant. Suggested-by: Rafael J. Wysocki Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit c201be3d3447e286e7e6924dc06fd61fe923954b Author: kondors1995 Date: Mon Aug 5 10:10:39 2024 +0300 Revert "cpufreq: Avoid leaving stale IRQ work items during CPU offline" This reverts commit c0079a7b3b1a3069ec86d4bb5d870edc9b292f99. commit 69078f1d44b561a0eddceb43873b2ad5a772fe9d Author: Vincent Guittot Date: Wed Feb 6 17:14:22 2019 +0100 sched/fair: Fix O(nr_cgroups) in the load balancing path commit 039ae8bcf7a5f4476f4487e6bf816885fb3fb617 upstream. This re-applies the commit reverted here: commit c40f7d74c741 ("sched/fair: Fix infinite loop in update_blocked_averages() by reverting a9e7f6544b9c") I.e. now that cfs_rq can be safely removed/added in the list, we can re-apply: commit a9e7f6544b9c ("sched/fair: Fix O(nr_cgroups) in load balance path") Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: sargun@sargun.me Cc: tj@kernel.org Cc: xiexiuqi@huawei.com Cc: xiezhipeng1@huawei.com Link: https://lkml.kernel.org/r/1549469662-13614-3-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar Cc: Vishnu Rangayyan Signed-off-by: Greg Kroah-Hartman Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 178af989ed8209fc342dfd89bdf23b935f7a2ce1 Author: Vincent Guittot Date: Wed Feb 6 17:14:21 2019 +0100 sched/fair: Optimize update_blocked_averages() commit 31bc6aeaab1d1de8959b67edbed5c7a4b3cdbe7c upstream. Removing a cfs_rq from rq->leaf_cfs_rq_list can break the parent/child ordering of the list when it will be added back. In order to remove an empty and fully decayed cfs_rq, we must remove its children too, so they will be added back in the right order next time. With a normal decay of PELT, a parent will be empty and fully decayed if all children are empty and fully decayed too. 
In such a case, we just have to ensure that the whole branch will be added when a new task is enqueued. This is default behavior since : commit f6783319737f ("sched/fair: Fix insertion in rq->leaf_cfs_rq_list") In case of throttling, the PELT of throttled cfs_rq will not be updated whereas the parent will. This breaks the assumption made above unless we remove the children of a cfs_rq that is throttled. Then, they will be added back when unthrottled and a sched_entity will be enqueued. As throttled cfs_rq are now removed from the list, we can remove the associated test in update_blocked_averages(). Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: sargun@sargun.me Cc: tj@kernel.org Cc: xiexiuqi@huawei.com Cc: xiezhipeng1@huawei.com Link: https://lkml.kernel.org/r/1549469662-13614-2-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar Cc: Vishnu Rangayyan Signed-off-by: Greg Kroah-Hartman Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 5f6df438eb114f68750cbaf73ff655cf09767aa9 Author: Vincent Guittot Date: Wed Jan 30 06:22:47 2019 +0100 sched/fair: Fix insertion in rq->leaf_cfs_rq_list commit f6783319737f28e4436a69611853a5a098cbe974 upstream. Sargun reported a crash: "I picked up c40f7d74c741a907cfaeb73a7697081881c497d0 sched/fair: Fix infinite loop in update_blocked_averages() by reverting a9e7f6544b9c and put it on top of 4.19.13. In addition to this, I uninlined list_add_leaf_cfs_rq for debugging. This revealed a new bug that we didn't get to because we kept getting crashes from the previous issue. When we are running with cgroups that are rapidly changing, with CFS bandwidth control, and in addition using the cpusets cgroup, we see this crash. Specifically, it seems to occur with cgroups that are throttled and we change the allowed cpuset." 
The algorithm used to order cfs_rq in rq->leaf_cfs_rq_list assumes that it will walk down to root the 1st time a cfs_rq is used and we will finish to add either a cfs_rq without parent or a cfs_rq with a parent that is already on the list. But this is not always true in presence of throttling. Because a cfs_rq can be throttled even if it has never been used but other CPUs of the cgroup have already used all the bandwdith, we are not sure to go down to the root and add all cfs_rq in the list. Ensure that all cfs_rq will be added in the list even if they are throttled. [ mingo: Fix !CGROUPS build. ] Reported-by: Sargun Dhillon Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: tj@kernel.org Fixes: 9c2791f936ef ("Fix hierarchical order in rq->leaf_cfs_rq_list") Link: https://lkml.kernel.org/r/1548825767-10799-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar Cc: Janne Huttunen Signed-off-by: Greg Kroah-Hartman Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 43bf835d9112937ae2b7d07b2b0a64b53598623d Author: Peter Zijlstra Date: Wed Jan 30 14:41:04 2019 +0100 sched/fair: Add tmp_alone_branch assertion commit 5d299eabea5a251fbf66e8277704b874bbba92dc upstream. The magic in list_add_leaf_cfs_rq() requires that at the end of enqueue_task_fair(): rq->tmp_alone_branch == &rq->lead_cfs_rq_list If this is violated, list integrity is compromised for list entries and the tmp_alone_branch pointer might dangle. Also, reflow list_add_leaf_cfs_rq() while there. This looses one indentation level and generates a form that's convenient for the next patch. 
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar Cc: Janne Huttunen Signed-off-by: Greg Kroah-Hartman Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 4a1c8a809ec62e802d9c2b174bcddd90750e6805 Author: Vincent Guittot Date: Fri Jun 12 17:47:03 2020 +0200 sched/pelt: Cleanup PELT divider Factorize in a single place the calculation of the divider to be used to to compute *_avg from *_sum value Suggested-by: Dietmar Eggemann Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200612154703.23555-1-vincent.guittot@linaro.org Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit ed092eafa66750326f15caa1a8e7c2f5f62f5951 Author: Dietmar Eggemann Date: Wed Jun 3 10:03:01 2020 +0200 sched/pelt: Remove redundant cap_scale() definition Besides in PELT cap_scale() is used in the Deadline scheduler class for scale-invariant bandwidth enforcement. Remove the cap_scale() definition in kernel/sched/pelt.c and keep the one in kernel/sched/sched.h. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20200603080304.16548-2-dietmar.eggemann@arm.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit a48e6b38b3b57cb00853bd405524f2158d454ba9 Author: Chengming Zhou Date: Fri Apr 8 19:53:08 2022 +0800 UPSTREAM: sched/fair: Fix cfs_rq_clock_pelt() for throttled cfs_rq Since commit 23127296889f ("sched/fair: Update scale invariance of PELT") change to use rq_clock_pelt() instead of rq_clock_task(), we should also use rq_clock_pelt() for throttled_clock_task_time and throttled_clock_task accounting to get correct cfs_rq_clock_pelt() of throttled cfs_rq. And rename throttled_clock_task(_time) to be clock_pelt rather than clock_task. 
Bug: 254441685 Fixes: 23127296889f ("sched/fair: Update scale invariance of PELT") Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ben Segall Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20220408115309.81603-1-zhouchengming@bytedance.com (cherry picked from commit 64eaf50731ac0a8c76ce2fedd50ef6652aabc5ff) Signed-off-by: Lee Jones Change-Id: I61e971d09f14708b8ee170fd5d5109144bba6e34 Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 98290e08beea376902e4e3d2eaeb3851b0192f74 Author: Vincent Guittot Date: Wed Jan 23 16:26:53 2019 +0100 UPSTREAM: sched/fair: Update scale invariance of PELT The current implementation of load tracking invariance scales the contribution with current frequency and uarch performance (only for utilization) of the CPU. One main result of this formula is that the figures are capped by current capacity of CPU. Another one is that the load_avg is not invariant because not scaled with uarch. The util_avg of a periodic task that runs r time slots every p time slots varies in the range : U * (1-y^r)/(1-y^p) * y^i < Utilization < U * (1-y^r)/(1-y^p) with U is the max util_avg value = SCHED_CAPACITY_SCALE At a lower capacity, the range becomes: U * C * (1-y^r')/(1-y^p) * y^i' < Utilization < U * C * (1-y^r')/(1-y^p) with C reflecting the compute capacity ratio between current capacity and max capacity. so C tries to compensate changes in (1-y^r') but it can't be accurate. Instead of scaling the contribution value of PELT algo, we should scale the running time. The PELT signal aims to track the amount of computation of tasks and/or rq so it seems more correct to scale the running time to reflect the effective amount of computation done since the last update. In order to be fully invariant, we need to apply the same amount of running time and idle time whatever the current capacity. 
Because running at lower capacity implies that the task will run longer, we have to ensure that the same amount of idle time will be applied when system becomes idle and no idle time has been "stolen". But reaching the maximum utilization value (SCHED_CAPACITY_SCALE) means that the task is seen as an always-running task whatever the capacity of the CPU (even at max compute capacity). In this case, we can discard this "stolen" idle times which becomes meaningless. In order to achieve this time scaling, a new clock_pelt is created per rq. The increase of this clock scales with current capacity when something is running on rq and synchronizes with clock_task when rq is idle. With this mechanism, we ensure the same running and idle time whatever the current capacity. This also enables to simplify the pelt algorithm by removing all references of uarch and frequency and applying the same contribution to utilization and loads. Furthermore, the scaling is done only once per update of clock (update_rq_clock_task()) instead of during each update of sched_entities and cfs/rt/dl_rq of the rq like the current implementation. This is interesting when cgroup are involved as shown in the results below: On a hikey (octo Arm64 platform). Performance cpufreq governor and only shallowest c-state to remove variance generated by those power features so we only track the impact of pelt algo. 
each test runs 16 times: ./perf bench sched pipe (higher is better) kernel tip/sched/core + patch ops/seconds ops/seconds diff cgroup root 59652(+/- 0.18%) 59876(+/- 0.24%) +0.38% level1 55608(+/- 0.27%) 55923(+/- 0.24%) +0.57% level2 52115(+/- 0.29%) 52564(+/- 0.22%) +0.86% hackbench -l 1000 (lower is better) kernel tip/sched/core + patch duration(sec) duration(sec) diff cgroup root 4.453(+/- 2.37%) 4.383(+/- 2.88%) -1.57% level1 4.859(+/- 8.50%) 4.830(+/- 7.07%) -0.60% level2 5.063(+/- 9.83%) 4.928(+/- 9.66%) -2.66% Then, the responsiveness of PELT is improved when CPU is not running at max capacity with this new algorithm. I have put below some examples of duration to reach some typical load values according to the capacity of the CPU with current implementation and with this patch. These values has been computed based on the geometric series and the half period value: Util (%) max capacity half capacity(mainline) half capacity(w/ patch) 972 (95%) 138ms not reachable 276ms 486 (47.5%) 30ms 138ms 60ms 256 (25%) 13ms 32ms 26ms On my hikey (octo Arm64 platform) with schedutil governor, the time to reach max OPP when starting from a null utilization, decreases from 223ms with current scale invariance down to 121ms with the new algorithm. 
Bug: 120440300 Change-Id: I0bd4ed2317f2a9a965634e53ce1476417af697a6 Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: dietmar.eggemann@arm.com Cc: patrick.bellasi@arm.com Cc: pjt@google.com Cc: pkondeti@codeaurora.org Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: srinivas.pandruvada@linux.intel.com Cc: thara.gopinath@linaro.org Link: https://lkml.kernel.org/r/1548257214-13745-3-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar (cherry picked from commit 23127296889fe84b0762b191b5d041e8ba6f2599) Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 0a7060dfca03fe7a078974f343372ad623e7a25d Author: Vincent Guittot Date: Wed Jan 23 16:26:52 2019 +0100 UPSTREAM: sched/fair: Move the rq_of() helper function Move rq_of() helper function so it can be used in pelt.c [ mingo: Improve readability while at it. 
] Bug: 120440300 Change-Id: I2133979476631d68baaffcaa308f4cdab94f22b1 Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: dietmar.eggemann@arm.com Cc: patrick.bellasi@arm.com Cc: pjt@google.com Cc: pkondeti@codeaurora.org Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: srinivas.pandruvada@linux.intel.com Cc: thara.gopinath@linaro.org Link: https://lkml.kernel.org/r/1548257214-13745-2-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar (cherry picked from commit 62478d9911fab9694c195f0ca8e4701de09be98e) Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 4ed482487f9d5ad5d11417da5376c596ffc2956c Author: Dietmar Eggemann Date: Fri Aug 3 15:05:38 2018 +0100 UPSTREAM: sched/fair: Remove setting task's se->runnable_weight during PELT update A CFS (SCHED_OTHER, SCHED_BATCH or SCHED_IDLE policy) task's se->runnable_weight must always be in sync with its se->load.weight. se->runnable_weight is set to se->load.weight when the task is forked (init_entity_runnable_average()) or reniced (reweight_entity()). There are two cases in set_load_weight() which since they currently only set se->load.weight could lead to a situation in which se->load.weight is different to se->runnable_weight for a CFS task: (1) A task switches to SCHED_IDLE. (2) A SCHED_FIFO, SCHED_RR or SCHED_DEADLINE task which has been reniced (during which only its static priority gets set) switches to SCHED_OTHER or SCHED_BATCH. Set se->runnable_weight to se->load.weight in these two cases to prevent this. This eliminates the need to explicitly set it to se->load.weight during PELT updates in the CFS scheduler fastpath. 
Bug: 120440300 Change-Id: I52184a9e1fd53cb42ef3ae546b1fae78b744c9ad Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Cc: Joel Fernandes Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Quentin Perret Cc: Thomas Gleixner Cc: Vincent Guittot Link: http://lkml.kernel.org/r/20180803140538.1178-1-dietmar.eggemann@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 4a465e3ebbc8004ce4f7f08f6022ee8315a94edf) Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit a97016777175a6305b14bbae29f4ce0a3a3930cd Author: Vincent Guittot Date: Fri Dec 14 23:10:06 2018 +0100 sched/pelt: Fix warning and clean up IRQ PELT config Commit 11d4afd4ff667f9b6178ee8c142c36cb78bd84db upstream. Create a config for enabling irq load tracking in the scheduler. irq load tracking is useful only when irq or paravirtual time is accounted but it's only possible with SMP for now. Also use __maybe_unused to remove the compilation warning in update_rq_clock_task() that has been introduced by: 2e62c4743adc ("sched/fair: Remove #ifdefs from scale_rt_capacity()") Suggested-by: Ingo Molnar Reported-by: Dou Liyang Reported-by: Miguel Ojeda Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bp@alien8.de Cc: dou_liyang@163.com Fixes: 2e62c4743adc ("sched/fair: Remove #ifdefs from scale_rt_capacity()") Link: http://lkml.kernel.org/r/1537867062-27285-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar Signed-off-by: Sasha Levin Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 8c3f9eb95d52cb68c7f62e2cfcf99a662008e4f7 Author: Vincent Guittot Date: Thu Jul 19 14:00:06 2018 +0200 sched/fair: Remove #ifdefs from scale_rt_capacity() Reuse cpu_util_irq() that has been defined for schedutil and set irq util to 0 when !CONFIG_IRQ_TIME_ACCOUNTING. 
But the compiler is not able to optimize the sequence (at least with aarch64 GCC 7.2.1): free *= (max - irq); free /= max; when irq is fixed to 0 Add a new inline function scale_irq_capacity() that will scale utilization when irq is accounted. Reuse this funciton in schedutil which applies similar formula. Suggested-by: Ingo Molnar Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Viresh Kumar Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: rjw@rjwysocki.net Link: http://lkml.kernel.org/r/1532001606-6689-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit bf59401f1068438e811e2bb93f5b2ac3c88f1b52 Author: Vincent Guittot Date: Fri Aug 31 17:22:55 2018 +0200 sched/pelt: Fix update_blocked_averages() for RT and DL classes update_blocked_averages() is called to periodiccally decay the stalled load of idle CPUs and to sync all loads before running load balance. When cfs rq is idle, it trigs a load balance during pick_next_task_fair() in order to potentially pull tasks and to use this newly idle CPU. This load balance happens whereas prev task from another class has not been put and its utilization updated yet. This may lead to wrongly account running time as idle time for RT or DL classes. Test that no RT or DL task is running when updating their utilization in update_blocked_averages(). We still update RT and DL utilization instead of simply skipping them to make sure that all metrics are synced when used during load balance. 
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 371bf4273269 ("sched/rt: Add rt_rq utilization tracking") Fixes: 3727e0e16340 ("sched/dl: Add dl_rq utilization tracking") Link: http://lkml.kernel.org/r/1535728975-22799-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit cc819e146c895dd29b3fe93fbd558ccd0a190ed6 Author: Vincent Guittot Date: Thu Jun 28 17:45:12 2018 +0200 sched/core: Use PELT for scale_rt_capacity() The utilization of the CPU by RT, DL and IRQs are now tracked with PELT so we can use these metrics instead of rt_avg to evaluate the remaining capacity available for CFS class. scale_rt_capacity() behavior has been changed and now returns the remaining capacity available for CFS instead of a scaling factor because RT, DL and IRQ provide now absolute utilization value. The same formula as schedutil is used: IRQ util_avg + (1 - IRQ util_avg / max capacity ) * /Sum rq util_avg but the implementation is different because it doesn't return the same value and doesn't benefit of the same optimization. 
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: claudio@evidence.eu.com Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: patrick.bellasi@arm.com Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: valentin.schneider@arm.com Cc: viresh.kumar@linaro.org Link: http://lkml.kernel.org/r/1530200714-4504-10-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 0f6598d6f0def162745d059a67d1148cf42cfaa3 Author: Vincent Guittot Date: Thu Jun 28 17:45:09 2018 +0200 sched/irq: Add IRQ utilization tracking interrupt and steal time are the only remaining activities tracked by rt_avg. Like for sched classes, we can use PELT to track their average utilization of the CPU. But unlike sched class, we don't track when entering/leaving interrupt; Instead, we take into account the time spent under interrupt context when we update rqs' clock (rq_clock_task). This also means that we have to decay the normal context time and account for interrupt time during the update. That's also important to note that because: rq_clock == rq_clock_task + interrupt time and rq_clock_task is used by a sched class to compute its utilization, the util_avg of a sched class only reflects the utilization of the time spent in normal context and not of the whole time of the CPU. The utilization of interrupt gives an more accurate level of utilization of CPU. The CPU utilization is: avg_irq + (1 - avg_irq / max capacity) * /Sum avg_rq Most of the time, avg_irq is small and neglictible so the use of the approximation CPU utilization = /Sum avg_rq was enough. 
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: claudio@evidence.eu.com Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: patrick.bellasi@arm.com Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: valentin.schneider@arm.com Cc: viresh.kumar@linaro.org Link: http://lkml.kernel.org/r/1530200714-4504-7-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit c616a7241810a508cb961a6e1658748113161221 Author: Vincent Guittot Date: Thu Jun 28 17:45:07 2018 +0200 sched/dl: Add dl_rq utilization tracking Similarly to what happens with RT tasks, CFS tasks can be preempted by DL tasks and the CFS's utilization might no longer describes the real utilization level. Current DL bandwidth reflects the requirements to meet deadline when tasks are enqueued but not the current utilization of the DL sched class. We track DL class utilization to estimate the system utilization. 
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: claudio@evidence.eu.com Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: patrick.bellasi@arm.com Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: valentin.schneider@arm.com Cc: viresh.kumar@linaro.org Link: http://lkml.kernel.org/r/1530200714-4504-5-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit abc5cabb9b98b262baad2250bbafbe54d8f2fc3e Author: Vincent Guittot Date: Thu Jun 28 17:45:05 2018 +0200 sched/rt: Add rt_rq utilization tracking schedutil governor relies on cfs_rq's util_avg to choose the OPP when CFS tasks are running. When the CPU is overloaded by CFS and RT tasks, CFS tasks are preempted by RT tasks and in this case util_avg reflects the remaining capacity but not what CFS want to use. In such case, schedutil can select a lower OPP whereas the CPU is overloaded. In order to have a more accurate view of the utilization of the CPU, we track the utilization of RT tasks. Only util_avg is correctly tracked but not load_avg and runnable_load_avg which are useless for rt_rq. rt_rq uses rq_clock_task and cfs_rq uses cfs_rq_clock_task but they are the same at the root group level, so the PELT windows of the util_sum are aligned. 
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: claudio@evidence.eu.com Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: patrick.bellasi@arm.com Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: valentin.schneider@arm.com Cc: viresh.kumar@linaro.org Link: http://lkml.kernel.org/r/1530200714-4504-3-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 7ce46e2c80931a4da93dbe7674998fee851c8f1f Author: Vincent Guittot Date: Thu Jun 28 17:45:04 2018 +0200 sched/pelt: Move PELT related code in a dedicated file We want to track rt_rq's utilization as a part of the estimation of the whole rq's utilization. This is necessary because rt tasks can steal utilization to cfs tasks and make them lighter than they are. As we want to use the same load tracking mecanism for both and prevent useless dependency between cfs and rt code, PELT code is moved in a dedicated file. 
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: claudio@evidence.eu.com Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: patrick.bellasi@arm.com Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: valentin.schneider@arm.com Cc: viresh.kumar@linaro.org Link: http://lkml.kernel.org/r/1530200714-4504-2-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit a7703d111f4372d3eba166af9ebd9092ff3dc66b Author: Vincent Guittot Date: Thu Nov 16 15:21:52 2017 +0100 sched/fair: Update and fix the runnable propagation rule Unlike running, the runnable part can't be directly propagated through the hierarchy when we migrate a task. The main reason is that runnable time can be shared with other sched_entities that stay on the rq and this runnable time will also remain on prev cfs_rq and must not be removed. Instead, we can estimate what should be the new runnable of the prev cfs_rq and check that this estimation stay in a possible range. The prop_runnable_sum is a good estimation when adding runnable_sum but fails most often when we remove it. Instead, we could use the formula below instead: gcfs_rq's runnable_sum = gcfs_rq->avg.load_sum / gcfs_rq->load.weight which assumes that tasks are equally runnable which is not true but easy to compute. Beside these estimates, we have several simple rules that help us to filter out wrong ones: - ge->avg.runnable_sum <= than LOAD_AVG_MAX - ge->avg.runnable_sum >= ge->avg.running_sum (ge->avg.util_sum << LOAD_AVG_MAX) - ge->avg.runnable_sum can't increase when we detach a task The effect of these fixes is better cgroups balancing. 
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Peter Zijlstra (Intel) Cc: Ben Segall Cc: Chris Mason Cc: Dietmar Eggemann Cc: Josef Bacik Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Tejun Heo Cc: Thomas Gleixner Cc: Yuyang Du Link: http://lkml.kernel.org/r/1510842112-21028-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 480aacf775e2ec594e0fed31b15678ef967f29fb Author: Peter Zijlstra Date: Thu Aug 24 13:06:35 2017 +0200 sched/fair: Update calc_group_*() comments I had a wee bit of trouble recalling how the calc_group_runnable() stuff worked.. add hopefully better comments. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 5b49acfeddbd52712395f011c4ab6e4695863738 Author: Josef Bacik Date: Thu Aug 3 11:13:39 2017 -0400 sched/fair: Calculate runnable_weight slightly differently Our runnable_weight currently looks like this runnable_weight = shares * runnable_load_avg / load_avg The goal is to scale the runnable weight for the group based on its runnable to load_avg ratio. The problem with this is it biases us towards tasks that never go to sleep. Tasks that go to sleep are going to have their runnable_load_avg decayed pretty hard, which will drastically reduce the runnable weight of groups with interactive tasks. To solve this imbalance we tweak this slightly, so in the ideal case it is still the above, but in the interactive case it is runnable_weight = shares * runnable_weight / load_weight which will make the weight distribution fairer between interactive and non-interactive groups. 
Signed-off-by: Josef Bacik Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-team@fb.com Cc: linux-kernel@vger.kernel.org Cc: riel@redhat.com Cc: tj@kernel.org Link: http://lkml.kernel.org/r/1501773219-18774-2-git-send-email-jbacik@fb.com Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 7be10d3b02e103159be137da4bc8fd9ebfda2b81 Author: Peter Zijlstra Date: Fri May 12 14:18:10 2017 +0200 sched/fair: Implement more accurate async detach The problem with the overestimate is that it will subtract too big a value from the load_sum, thereby pushing it down further than it ought to go. Since runnable_load_avg is not subject to a similar 'force', this results in the occasional 'runnable_load > load' situation. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit eebab46a1ff61d392a032456a0274185ec09c78d Author: Peter Zijlstra Date: Fri May 12 14:16:30 2017 +0200 sched/fair: Align PELT windows between cfs_rq and its se The PELT _sum values are a saw-tooth function, dropping on the decay edge and then growing back up again during the window. When these window-edges are not aligned between cfs_rq and se, we can have the situation where, for example, on dequeue, the se decays first. Its _sum values will be small(er), while the cfs_rq _sum values will still be on their way up. Because of this, the subtraction: cfs_rq->avg._sum -= se->avg._sum will result in a positive value. This will then, once the cfs_rq reaches an edge, translate into its _avg value jumping up. This is especially visible with the runnable_load bits, since they get added/subtracted a lot. 
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit dd849181691e6b895696b331c9a750139a1bffd4 Author: Peter Zijlstra Date: Thu May 11 17:57:24 2017 +0200 sched/fair: Implement synchonous PELT detach on load-balance migrate Vincent wondered why his self migrating task had a roughly 50% dip in load_avg when landing on the new CPU. This is because we uncondionally take the asynchronous detatch_entity route, which can lead to the attach on the new CPU still seeing the old CPU's contribution to tg->load_avg, effectively halving the new CPU's shares. While in general this is something we have to live with, there is the special case of runnable migration where we can do better. Tested-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit fa4e5dee6619a0a3c5368e25147263b8576318e4 Author: Peter Zijlstra Date: Sat May 6 15:59:54 2017 +0200 sched/fair: Propagate an effective runnable_load_avg The load balancer uses runnable_load_avg as load indicator. For !cgroup this is: runnable_load_avg = \Sum se->avg.load_avg ; where se->on_rq That is, a direct sum of all runnable tasks on that runqueue. As opposed to load_avg, which is a sum of all tasks on the runqueue, which includes a blocked component. However, in the cgroup case, this comes apart since the group entities are always runnable, even if most of their constituent entities are blocked. Therefore introduce a runnable_weight which for task entities is the same as the regular weight, but for group entities is a fraction of the entity weight and represents the runnable part of the group runqueue. 
Then propagate this load through the PELT hierarchy to arrive at an effective runnable load average -- which we should not confuse with the canonical runnable load average.
Also, two atomic ops on a single cacheline is already more expensive than an uncontended lock. Since we want to add more, convert the thing over to an explicit cacheline with a lock in. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 938acabba842513db34a25ace967f7771998ee29 Author: Vincent Guittot Date: Wed May 17 11:50:45 2017 +0200 sched/fair: Use reweight_entity() for set_user_nice() Now that we directly change load_avg and propagate that change into the sums, sys_nice() and co should do the same, otherwise its possible to confuse load accounting when we migrate near the weight change. Fixes-by: Josef Bacik Signed-off-by: Vincent Guittot [ Added changelog, fixed the call condition. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20170517095045.GA8420@linaro.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit c1d25beea75097359c3d0300330e2bba0106cff0 Author: Peter Zijlstra Date: Sat May 6 16:11:34 2017 +0200 sched/fair: More accurate reweight_entity() When a (group) entity changes it's weight we should instantly change its load_avg and propagate that change into the sums it is part of. Because we use these values to predict future behaviour and are not interested in its historical value. Without this change, the change in load would need to propagate through the average, by which time it could again have changed etc.. always chasing itself. With this change, the cfs_rq load_avg sum will more accurately reflect the current runnable and expected return of blocked load. 
Reported-by: Paul Turner [josef: compile fix !SMP || !FAIR_GROUP] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 3092e27836e4e71089267a27e5a9ab4100627618 Author: Peter Zijlstra Date: Thu Aug 24 17:45:35 2017 +0200 sched/fair: Introduce {en,de}queue_load_avg() Analogous to the existing {en,de}queue_runnable_load_avg() add helpers for {en,de}queue_load_avg(). More users will follow. Includes some code movement to avoid fwd declarations. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 2d9eb6d1ce196555a3571506a0566791f53da0d4 Author: Peter Zijlstra Date: Thu Aug 24 17:38:30 2017 +0200 sched/fair: Rename {en,de}queue_entity_load_avg() Since they're now purely about runnable_load, rename them. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit beda9e28dbb0eb34d787d4cbac2bc8399fa6f1a6 Author: Peter Zijlstra Date: Sat May 6 17:37:03 2017 +0200 sched/fair: Move enqueue migrate handling Move the entity migrate handling from enqueue_entity_load_avg() to update_load_avg(). This has two benefits: - {en,de}queue_entity_load_avg() will become purely about managing runnable_load - we can avoid a double update_tg_load_avg() and reduce pressure on the global tg->shares cacheline The reason we do this is so that we can change update_cfs_shares() to change both weight and (future) runnable_weight. 
For this to work we need to have the cfs_rq averages up-to-date (which means having done the attach), but we need the cfs_rq->avg.runnable_avg to not yet include the se's contribution (since se->on_rq == 0). Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 94f87f43b19ceeecc31c9e7169533ad64efeb085 Author: Peter Zijlstra Date: Sat May 6 17:32:43 2017 +0200 sched/fair: Change update_load_avg() arguments Most call sites of update_load_avg() already have cfs_rq_of(se) available, pass it down instead of recomputing it. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit f1bfd1f0c849264e58ce1415ddea244440a92797 Author: Peter Zijlstra Date: Sat May 6 16:42:08 2017 +0200 sched/fair: Remove se->load.weight from se->avg.load_sum Remove the load from the load_sum for sched_entities, basically turning load_sum into runnable_sum. This prepares for better reweighting of group entities. Since we now have different rules for computing load_avg, split ___update_load_avg() into two parts, ___update_load_sum() and ___update_load_avg(). So for se: ___update_load_sum(.weight = 1) ___update_load_avg(.weight = se->load.weight) and for cfs_rq: ___update_load_sum(.weight = cfs_rq->load.weight) ___update_load_avg(.weight = 1) Since the primary consumable is load_avg, most things will not be affected. Only those few sites that initialize/modify load_sum need attention. 
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 9600274a37f34758ce3dfda9f5257b7737f298bb Author: Peter Zijlstra Date: Thu May 11 18:16:06 2017 +0200 sched/fair: Cure calc_cfs_shares() vs. reweight_entity() Vincent reported that when running in a cgroup, his root cfs_rq->avg.load_avg dropped to 0 on task idle. This is because reweight_entity() will now immediately propagate the weight change of the group entity to its cfs_rq, and as it happens, our approximation (5) for calc_cfs_shares() results in 0 when the group is idle. Avoid this by using the correct (3) as a lower bound on (5). This way the empty cgroup will slowly decay instead of instantly drop to 0. Reported-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 664ab7a3e73cbb37c84125925f2cbbc91ed7fdca Author: Peter Zijlstra Date: Tue May 9 11:04:07 2017 +0200 sched/fair: Add comment to calc_cfs_shares() Explain the magic equation in calc_cfs_shares() a bit better. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 8cdb20cedab8458aafc8d4766ecb08a28d6ad076 Author: Peter Zijlstra Date: Sat May 6 16:03:17 2017 +0200 sched/fair: Clean up calc_cfs_shares() For consistency's sake, we should have only a single reading of tg->shares. 
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit e4f0f21e76d364601055d5d1f14eae828fd4383a Author: kondors1995 Date: Sat Aug 17 12:00:36 2024 +0300 Revert "UPSTREAM: sched/uclamp: Add CPU's clamp buckets refcounting" This reverts commit 116cf381f43abb961d1c7276c9be64b03a91cbfc. commit 8a408ac031b6990f1a2d538680ffac88e42e477c Author: kondors1995 Date: Sat Aug 17 12:00:36 2024 +0300 Revert "UPSTREAM: sched/uclamp: Add bucket local max tracking" This reverts commit a684fdc2dec4430a1c96b34360a7eccee2dc16d1. commit 6f2949c1073b97efb28f05d086fdc9e473d421c4 Author: kondors1995 Date: Sat Aug 17 12:00:35 2024 +0300 Revert "UPSTREAM: sched/uclamp: Enforce last task's UCLAMP_MAX" This reverts commit 41ce2236860d4f0e9c7e9f7a3659080c19bbdd66. commit 86d7baea30401efcc1d609c619c1526a952e6419 Author: kondors1995 Date: Sat Aug 17 12:00:35 2024 +0300 Revert "UPSTREAM: sched/uclamp: Add system default clamps" This reverts commit 5bee2de619c41ffb7ba39cb6df3db0b2ed041c08. commit 9d758d18c03983c60e0aba8bfd98ca0084e7d7c4 Author: kondors1995 Date: Sat Aug 17 12:00:33 2024 +0300 Revert "UPSTREAM: sched/core: Allow sched_setattr() to use the current policy" This reverts commit 38b1e6723b4accdd747cc75ab954f337559efad9. commit 164913a4e0ab61a201d7802eb0de81099453971a Author: kondors1995 Date: Sat Aug 17 12:00:20 2024 +0300 Revert "UPSTREAM: sched/uclamp: Extend sched_setattr() to support utilization clamping" This reverts commit 486d86679747400e8b0cf39e2b0d35562f9a9989. commit 98262c0640ec825cc93c5faf076c7d24ae8ae6a7 Author: kondors1995 Date: Sat Aug 17 11:59:57 2024 +0300 Revert "UPSTREAM: sched/uclamp: Reset uclamp values on RESET_ON_FORK" This reverts commit 13caaea80c308cbc2600644ecd2cbca95f5e5095. 
commit 17a9d13f5c3bc718f83c71b7b18b40a6525f22c0 Author: kondors1995 Date: Sat Aug 17 11:59:57 2024 +0300 Revert "UPSTREAM: sched/uclamp: Set default clamps for RT tasks" This reverts commit 1e06d5830af9e0952a8e311786e6b11b0deb492a. commit 65d7db99c966d5c853da01eb8897be026f5674ed Author: kondors1995 Date: Sat Aug 17 11:59:57 2024 +0300 Revert "BACKPORT: sched/cpufreq, sched/uclamp: Add clamps for FAIR and RT tasks" This reverts commit 1f42ab6cc15a3611c4015feca3787d78d07c56d7. commit 318ea0855d8084332eb060ca8948bb970cf08129 Author: kondors1995 Date: Sat Aug 17 11:59:57 2024 +0300 Revert "UPSTREAM: sched/uclamp: Add uclamp_util_with()" This reverts commit e6a7b88cc257c9318d5a6aa2ad1116bcb4be0770. commit f1ec7c15ea378c0fa4089a36d753e99054265c10 Author: kondors1995 Date: Sat Aug 17 11:59:57 2024 +0300 Revert "UPSTREAM: sched/uclamp: Extend CPU's cgroup controller" This reverts commit 6a413999607b529bf0e33ce12ca1df84302e9f14. commit 8814a297260c377a08f12350ab47f302d333043b Author: kondors1995 Date: Sat Aug 17 11:59:56 2024 +0300 Revert "UPSTREAM: sched/uclamp: Propagate parent clamps" This reverts commit 4963c9685e76a78b8cdbb8cc9843970f607e6c32. commit fe9d3b66e407939f83c8b9ee26b9318ec4316609 Author: kondors1995 Date: Sat Aug 17 11:59:56 2024 +0300 Revert "UPSTREAM: sched/uclamp: Propagate system defaults to the root group" This reverts commit a3bbe484012e0e1343e1be875f3abca40d79cd04. commit f6256062570adc1d7e261ffff855f0fa1128d8b1 Author: kondors1995 Date: Sat Aug 17 11:59:56 2024 +0300 Revert "UPSTREAM: sched/uclamp: Use TG's clamps to restrict TASK's clamps" This reverts commit dbb3b287e2dd3c012bce9e90e5f2e5007ef8d5de. commit 21d5ea4fd377bedd6ad5b49a829fa7b3867d9bd6 Author: kondors1995 Date: Sat Aug 17 11:59:56 2024 +0300 Revert "UPSTREAM: sched/uclamp: Update CPU's refcount on TG's clamp changes" This reverts commit b5c75307c4f7017a3a340514917bb28cd7d0acae. 
commit 2d9b5a47c47a33c965f744ce34c8924524558bae Author: kondors1995 Date: Sat Aug 17 11:59:56 2024 +0300 Revert "UPSTREAM: sched/uclamp: Always use 'enum uclamp_id' for clamp_id values" This reverts commit a479e1de9bc60243dcafea50d9a52bf784e1ba0f. commit 962708c53ab41d17d9233543581553226f6da382 Author: kondors1995 Date: Sat Aug 17 11:59:51 2024 +0300 Revert "UPSTREAM: sched/core: Fix uclamp ABI bug, clean up and robustify sched_read_attr() ABI logic and code" This reverts commit fe18af67327ccfb1fe1f49fbf85884e18cf39a58. commit f1bb535dbf77aec377d06a795a52c3e1c3489ab9 Author: kondors1995 Date: Sat Aug 17 11:59:19 2024 +0300 Revert "UPSTREAM: sched/core: Fix compilation error when cgroup not selected" This reverts commit 4f89a28e12c87ebcea8bc11d44e60c54f1c38021. commit 7e33f0d51debf2dd24a33ea7fbb8ac19edb42b1e Author: kondors1995 Date: Sat Aug 17 11:59:19 2024 +0300 Revert "sched/uclamp: Reject negative values in cpu_uclamp_write()" This reverts commit a12fbe805e840557b28f97ecce6c4749be7a5daa. commit 546d51a84878a1046177b739ccdfbfe4748818b4 Author: kondors1995 Date: Sat Aug 17 11:59:19 2024 +0300 Revert "sched/core: Fix size of rq::uclamp initialization" This reverts commit c7be60603f26b5ad2ac973670d2ca99e1586ea48. commit f8456cecf7592fd67d02f5250579d5332a3e168d Author: kondors1995 Date: Sat Aug 17 11:59:19 2024 +0300 Revert "sched/core: Fix reset-on-fork from RT with uclamp" This reverts commit fbeca2f3560cb9601c015b4b7e2986dfd59e6113. commit 7f497bb9e5d2be6a0182e30b1b9cd6feef3b70ab Author: kondors1995 Date: Sat Aug 17 11:59:19 2024 +0300 Revert "BACKPORT: sched/uclamp: Remove uclamp_util()" This reverts commit b6f58cb73defb3e7c62c4ef73caa1f6913b9d48c. commit 1573bacd6d6455ddcf9aa62e6ed71080ee8fe73d Author: kondors1995 Date: Sat Aug 17 11:59:19 2024 +0300 Revert "UPSTREAM: sched/uclamp: Make uclamp util helpers use and return UL values" This reverts commit 1fbeb27caf01456e489b3c14dbdbd9d50816edd5. 
commit 33c125b0a8d140d9f6333d1334784b6eaae89009 Author: kondors1995 Date: Sat Aug 17 11:59:18 2024 +0300 Revert "UPSTREAM: sched/uclamp: Fix initialization of struct uclamp_rq" This reverts commit 294a7066186224c38540adb652e852762a5e60cc. commit fd06089869bde9880e4953e1f43f288db2fc5b14 Author: kondors1995 Date: Sat Aug 17 11:59:18 2024 +0300 Revert "BACKPORT: sched/uclamp: Protect uclamp fast path code with static key" This reverts commit b2f4e5d8c3f7684796b442a2cecfecd93f25a675. commit 592e4a8ed2c0a59fac12c39b2a023c3517c7c801 Author: kondors1995 Date: Sat Aug 17 11:59:18 2024 +0300 Revert "UPSTREAM: sched/uclamp: Fix a deadlock when enabling uclamp static key" This reverts commit 3eb05145c9dbdc5cb046ae836b330a3f4a2df427. commit 60dc9ca3b6caf6db7cb3b209527111812b78587b Author: kondors1995 Date: Sat Aug 17 11:59:13 2024 +0300 Revert "BACKPORT: sched/uclamp: Add a new sysctl to control RT default boost value" This reverts commit 1abcbcbea7d4c646092ba9a2418cf9fe83121a6b. commit 72296d4745d8b53ccedd64edfaa455ca6c8fae9c Author: kondors1995 Date: Sat Aug 17 11:58:42 2024 +0300 Revert "ANDROID: sched/core: Add a latency-sensitive flag to uclamp" This reverts commit b5fa516b3aa41f6095b4badca38273f064f434ea. commit 9b478422f4676d419918afb41a71015a02f79a84 Author: kondors1995 Date: Sat Aug 17 11:58:42 2024 +0300 Revert "ANDROID: sched: Introduce uclamp latency and boost wrapper" This reverts commit fe73bc33dfdeb0c59b9ef177af505402a09b4fd7. commit 2de1a0534caaa6fa2255b9eda5d64fb2b9daa209 Author: kondors1995 Date: Sat Aug 17 11:58:42 2024 +0300 Revert "sched/fair: Modify boosted_task_util() to reflect uclamp changes" This reverts commit 8c84f3cd7fa1ef844aca19ceb2ad11e89750af27. commit 523dab791c4ebd52b8b6473d52967edeb7bdbfd8 Author: kondors1995 Date: Sat Aug 17 11:58:36 2024 +0300 Revert "Revert "sched/fair: Revert Google's capacity margin hacks"" This reverts commit b623172b47e4655c5eab885f4eb8517ba2392928. 
commit 43bd6d52219dcca4cdbd64129e4c4b6919549d9a Author: kondors1995 Date: Sat Aug 17 11:58:36 2024 +0300 Revert "Revert "sched: Stub prefer_high_cap"" This reverts commit 8e1140bfebc45a509e28bc4962316487c2410792. commit 64662ef0b3e7c6b11d7516ebc3f83483a1fa71a9 Author: kondors1995 Date: Sat Aug 17 11:58:30 2024 +0300 Revert "Revert "Revert "sched: separate boost signal from placement hint""" This reverts commit 79111a20d915fbc86c41239bbc4b0b4d10b364a5. commit 1bac20806128df71140bd14c52b83dc1117ade38 Author: kondors1995 Date: Sat Aug 17 11:57:50 2024 +0300 Revert "Revert "Revert "sched/fair: check if mid capacity cpu exists""" This reverts commit 7508259cb6aad46cd1d4a0d9cdb2c918411e3b92. commit 42e0ef5e1b4f56d6337c1a6f3504d275cbc9a043 Author: kondors1995 Date: Sat Aug 17 11:56:58 2024 +0300 Revert "Revert "Revert "sched: separate capacity margin for boosted tasks""" This reverts commit 435cf700fa3b7e968b90f0061cb95260f62ece7e. commit eab623c38f37d7cc0973478f55256173ce3d213d Author: kondors1995 Date: Sat Aug 17 11:56:57 2024 +0300 Revert "Revert "Revert "sched: change capacity margin down to 20% for non-boost task""" This reverts commit 0567c2dfa0b34ad7b492f075d5ce3bae4d84977f. commit 81f6c4feb22ee8da71f7adde5513ce1ae8a48346 Author: kondors1995 Date: Sat Aug 17 11:56:57 2024 +0300 Revert "Revert "Revert "sched/fair: use actual cpu capacity to calculate boosted util""" This reverts commit 9468b2ce801ec8799f68742e52aa1ed58d587a21. commit c5bff7085b144149dc1ed774a3312c19a1669103 Author: kondors1995 Date: Sat Aug 17 11:56:57 2024 +0300 Revert "Revert "Revert "sched/fair: do not use boosted margin for prefer_high_cap case""" This reverts commit 25f6b813b40bc4eddf9dee0ea675208e8ff624ef. commit d5cc737eb99b7aeced3b5d1546799ce1cffe0129 Author: kondors1995 Date: Sat Aug 17 11:56:57 2024 +0300 Revert "Revert "Revert "Revert "sched/fair: use actual cpu capacity to calculate boosted util"""" This reverts commit 607d0c1f5ac9dff3b34cc7874d9b07c5d2b71bfb. 
commit 944f85f208f1a520eb9759571d49f75db84b3ffc Author: kondors1995 Date: Sat Aug 17 11:56:50 2024 +0300 Revert "sched/fair: Make boosted and prefer_idle tunables uclamp aware" This reverts commit 8d583957240d990d35b3ae951e09c914b77f7c46. commit ff7fc5b1110dd9b31817b7186467938974ed8017 Author: kondors1995 Date: Sat Aug 17 11:56:43 2024 +0300 Revert "sched/uclamp: Fix incorrect uclamp.latency_sensitive setting" This reverts commit 282a8ac6f588cce1aebf7b01f3281cba98b4bf0d. commit 4e5d3a1c362e4c3616126dfc3422d6207eeae977 Author: kondors1995 Date: Sat Aug 17 11:56:43 2024 +0300 Revert "sched/uclamp: Make uclamp_boosted() return proper boosted value" This reverts commit e1ffe11cccd14b460289c0d3e5f7bb0bb1717ac5. commit b6ae08ea8bbe41f3c4b90b303e2c39a4c7c5f7b3 Author: kondors1995 Date: Sat Aug 17 11:56:10 2024 +0300 Revert "sched/uclamp: Allow to reset a task uclamp constraint value" This reverts commit b6b69eece539ebeed9bd2276a9a1dcedb28a7665. commit 658ddb506825027979ee263aff8d55e0cff3dd98 Author: kondors1995 Date: Sat Aug 17 11:55:14 2024 +0300 Revert "UPSTREAM: sched/uclamp: Fix incorrect condition" This reverts commit ce1a62791ee282cad50a8c37ff6a0083174ceacd. commit 1c5a70035dafad8750eafa8c5ee40b11115be18e Author: kondors1995 Date: Sat Aug 17 11:55:14 2024 +0300 Revert "FROMGIT: sched/uclamp: Fix a bug in propagating uclamp value in new cgroups" This reverts commit 16d73cafeabe4c9726f170839baa325b9d6399fa. commit ed892012da806647a464df46203dad13ca07e4d8 Author: kondors1995 Date: Sat Aug 17 11:55:14 2024 +0300 Revert "sched/uclamp: Fix locking around cpu_util_update_eff()" This reverts commit e954be1dbf9565e866a0a2e191d88916076a8892. commit 505f58a688443900fdaeb4cca4e3ca66b0cf867c Author: kondors1995 Date: Sat Aug 17 11:55:13 2024 +0300 Revert "sched/uclamp: Remove unnecessary mutex_init()" This reverts commit 31714e404d1478317f97bf068ddf8890e8c6df52. 
commit 8f96c904caa429cb4e413bae75e746a38b9f1214 Author: kondors1995 Date: Sat Aug 17 11:55:13 2024 +0300 Revert "FROMLIST: sched: Fix out-of-bound access in uclamp" This reverts commit 973affb92e72ba195d187c904e4fe5ab0cd5e478. commit 29df4134d5a23f686000ebfd31ee3edaf2f6bb44 Author: kondors1995 Date: Sat Aug 17 11:55:13 2024 +0300 Revert "sched/uclamp: Fix wrong implementation of cpu.uclamp.min" This reverts commit 07c3a9af2f16b0bdbda148b62f64441075d1eafb. commit 4efefc8fcc48eaff905fc548099b5c5f181e8c8c Author: kondors1995 Date: Sat Aug 17 11:55:13 2024 +0300 Revert "sched/uclamp: Fix uclamp_tg_restrict()" This reverts commit 76563866876422268a39e26ade95c39c73933f4b. commit 7d5d5fe448300a61d6718919d3c38277e32285db Author: kondors1995 Date: Sat Aug 17 11:55:13 2024 +0300 Revert "sched/uclamp: Ignore max aggregation if rq is idle" This reverts commit 08a6710f896689e691316ea2f617bc8524badf4c. commit 488c9711417e6fc0d07401e928e9523bd241915c Author: kondors1995 Date: Sat Aug 17 11:55:13 2024 +0300 Revert "ANDROID: sched: Make uclamp changes depend on CAP_SYS_NICE" This reverts commit 957d737358746d018b17a024d914b7637301e970. commit 1ac8237c8049014ea11f5505c95fb0c91d5608d1 Author: kondors1995 Date: Sat Aug 17 11:55:13 2024 +0300 Revert "sched: Fix UCLAMP_FLAG_IDLE setting" This reverts commit e9a6a0f1062d90141caf298c4b1fd797056da4d6. commit 0ef1e5ecb196bd556499ba0a7021a53ed87cadc4 Author: kondors1995 Date: Sat Aug 17 11:54:15 2024 +0300 Revert "sched/uclamp: Fix rq->uclamp_max not set on first enqueue" This reverts commit 977a80d7951e9cc356e85378bdbe856a6fab3fb8. commit b69fc8ea5fbcc014e4a8fdafefb468dc348baa38 Author: kondors1995 Date: Sat Aug 17 11:53:53 2024 +0300 Revert "sched/fair: Reduce busy load balance interval" This reverts commit c54355abd4ab0e8d81c3ba5e5d22f9934183aabc. 
commit 45daeebd161dab7ff6550030d0a8bcb302b2c271 Author: kondors1995 Date: Sat Aug 17 11:53:40 2024 +0300 Revert "sched/fair: Reduce minimal imbalance threshold" This reverts commit 47a4df59c1bc5de153cacc79c287a0b4626591e7. commit 8557606f8d15209e2f68ff49485e62fb97a441c3 Author: kondors1995 Date: Sat Aug 17 11:53:32 2024 +0300 Revert "sched/deadline: Fix stale throttling on de-/boosted tasks" This reverts commit 1f2196de96395b2125287bb8df3838d67495b752. commit 705944ee989af6d24932895c541abca24caa1b21 Author: kondors1995 Date: Sat Aug 17 11:53:22 2024 +0300 Revert "sched: fair: consider all running tasks in cpu for load balance" This reverts commit c1e04a4d1360eac58d9e731b93bb4676a08b9b99. commit ecffc3948595a8cb3c246135082ee17ab7d414ca Author: kondors1995 Date: Sat Aug 17 11:53:03 2024 +0300 Revert "sched/fair: Revert Google's capacity margin hacks" This reverts commit e7c82d8c4debb8e558d36eb81507f86e223298fa. commit c064aca8d86c768eb1ebf8536c25e49718cb55b9 Author: kondors1995 Date: Sat Aug 17 11:48:56 2024 +0300 Revert "BACKPORT: sched/fair: Make task_fits_capacity() consider uclamp restrictions" This reverts commit a59a899e6a995cd2ba80a5b041d89353e6afb117. 
# Conflicts: # include/linux/sched/sysctl.h # kernel/sched/fair.c # kernel/sched/sched.h # kernel/sched/tune.c --- Documentation/cgroup-v2.txt | 34 ++++++++++ Documentation/scheduler/sched-tune.txt | 4 +- include/linux/sched/sysctl.h | 8 +-- kernel/sched/core.c | 84 ++++++++++++++++++++++--- kernel/sched/cpupri.c | 25 +++++++- kernel/sched/cpupri.h | 5 +- kernel/sched/fair.c | 72 ++++++--------------- kernel/sched/rt.c | 81 +++++++++++++++++++----- kernel/sched/sched.h | 86 ++++++-------------------- kernel/sched/tune.c | 5 +- kernel/sched/tune.h | 4 +- 11 files changed, 252 insertions(+), 156 deletions(-) diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index 7e3672812510..6d247618dca2 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -902,6 +902,13 @@ controller implements weight and absolute bandwidth limit models for normal scheduling policy and absolute bandwidth allocation model for realtime scheduling policy. +In all the above models, cycles distribution is defined only on a temporal +base and it does not account for the frequency at which tasks are executed. +The (optional) utilization clamping support allows to hint the schedutil +cpufreq governor about the minimum desired frequency which should always be +provided by a CPU, as well as the maximum desired frequency, which should not +be exceeded by a CPU. + CPU Interface Files ~~~~~~~~~~~~~~~~~~~ @@ -964,6 +971,33 @@ All time durations are in microseconds. Shows pressure stall information for CPU. See Documentation/accounting/psi.txt for details. + cpu.uclamp.min + A read-write single value file which exists on non-root cgroups. + The default is "0", i.e. no utilization boosting. + + The requested minimum utilization (protection) as a percentage + rational number, e.g. 12.34 for 12.34%. + + This interface allows reading and setting minimum utilization clamp + values similar to the sched_setattr(2). 
This minimum utilization + value is used to clamp the task specific minimum utilization clamp. + + The requested minimum utilization (protection) is always capped by + the current value for the maximum utilization (limit), i.e. + `cpu.uclamp.max`. + + cpu.uclamp.max + A read-write single value file which exists on non-root cgroups. + The default is "max". i.e. no utilization capping + + The requested maximum utilization (limit) as a percentage rational + number, e.g. 98.76 for 98.76%. + + This interface allows reading and setting maximum utilization clamp + values similar to the sched_setattr(2). This maximum utilization + value is used to clamp the task specific maximum utilization clamp. + + Memory ------ diff --git a/Documentation/scheduler/sched-tune.txt b/Documentation/scheduler/sched-tune.txt index 1a103715f7bd..be728705fe25 100644 --- a/Documentation/scheduler/sched-tune.txt +++ b/Documentation/scheduler/sched-tune.txt @@ -233,9 +233,9 @@ Thus, with the sched_cfs_boost enabled we have the following main functions to get the current utilization of a CPU: cpu_util() - boosted_cpu_util() + stune_util() -The new boosted_cpu_util() is similar to the first but returns a boosted +The new stune_util() is similar to the first but returns a boosted utilization signal which is a function of the sched_cfs_boost value. 
This function is used in the CFS scheduler code paths where schedutil needs to diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 2ee605775225..4091e5547a69 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -116,16 +116,16 @@ extern int sched_rt_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +extern int sched_updown_migrate_handler(struct ctl_table *table, + int write, void __user *buffer, + size_t *lenp, loff_t *ppos); + #ifdef CONFIG_UCLAMP_TASK extern int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif -extern int sched_updown_migrate_handler(struct ctl_table *table, - int write, void __user *buffer, - size_t *lenp, loff_t *ppos); - extern int sysctl_numa_balancing(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fa4811170da3..d64bc47e0532 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -45,6 +45,7 @@ #include "sched.h" #include "walt.h" +#include "tune.h" #include "../workqueue_internal.h" #include "../smpboot.h" @@ -800,7 +801,7 @@ unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; * This knob will not override the system default sched_util_clamp_min defined * above. 
*/ -unsigned int sysctl_sched_uclamp_util_min_rt_default = 0; +unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE; /* All clamps are required to be less or equal than these values */ static struct uclamp_se uclamp_default[UCLAMP_CNT]; @@ -836,12 +837,7 @@ static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1); } -static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value) -{ - return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value); -} - -static inline enum uclamp_id uclamp_none(enum uclamp_id clamp_id) +static inline unsigned int uclamp_none(enum uclamp_id clamp_id) { if (clamp_id == UCLAMP_MIN) return 0; @@ -884,7 +880,7 @@ static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id, } static inline -enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id, +unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id, unsigned int clamp_value) { struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket; @@ -1450,6 +1446,40 @@ static void uclamp_post_fork(struct task_struct *p) uclamp_update_util_min_rt_default(p); } +#ifdef CONFIG_SMP +unsigned int uclamp_task(struct task_struct *p) +{ + unsigned long util; + + util = task_util_est(p); + util = max(util, uclamp_eff_value(p, UCLAMP_MIN)); + util = min(util, uclamp_eff_value(p, UCLAMP_MAX)); + + return util; +} + +bool uclamp_boosted(struct task_struct *p) +{ + return uclamp_eff_value(p, UCLAMP_MIN) > 0; +} + +bool uclamp_latency_sensitive(struct task_struct *p) +{ +#ifdef CONFIG_UCLAMP_TASK_GROUP + struct cgroup_subsys_state *css = task_css(p, cpu_cgrp_id); + struct task_group *tg; + + if (!css) + return false; + tg = container_of(css, struct task_group, css); + + return tg->latency_sensitive; +#else + return false; +#endif +} +#endif /* CONFIG_SMP */ + static void __init init_uclamp_rq(struct rq *rq) { enum uclamp_id clamp_id; 
@@ -1501,6 +1531,41 @@ static void __setscheduler_uclamp(struct task_struct *p, const struct sched_attr *attr) { } static inline void uclamp_fork(struct task_struct *p) { } static inline void uclamp_post_fork(struct task_struct *p) { } + +long schedtune_task_margin(struct task_struct *task); + +#ifdef CONFIG_SMP +unsigned int uclamp_task(struct task_struct *p) +{ + unsigned long util = task_util_est(p); +#ifdef CONFIG_SCHED_TUNE + long margin = schedtune_task_margin(p); + + trace_sched_boost_task(p, util, margin); + + util += margin; +#endif + + return util; +} + +bool uclamp_boosted(struct task_struct *p) +{ +#ifdef CONFIG_SCHED_TUNE + return schedtune_task_boost(p) > 0; +#endif + return false; +} + +bool uclamp_latency_sensitive(struct task_struct *p) +{ +#ifdef CONFIG_SCHED_TUNE + return schedtune_prefer_idle(p) != 0; +#endif + return false; +} +#endif /* CONFIG_SMP */ + static inline void init_uclamp(void) { } #endif /* CONFIG_UCLAMP_TASK */ @@ -8273,6 +8338,9 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css) enum uclamp_id clamp_id; unsigned int clamps; + lockdep_assert_held(&uclamp_mutex); + SCHED_WARN_ON(!rcu_read_lock_held()); + css_for_each_descendant_pre(css, top_css) { uc_parent = css_tg(css)->parent ? css_tg(css)->parent->uclamp : NULL; diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index f37daebed44e..487e4fdf5055 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -55,6 +55,8 @@ static int convert_prio(int prio) * @cp: The cpupri context * @p: The task * @lowest_mask: A mask to fill in with selected CPUs (or NULL) + * @fitness_fn: A pointer to a function to do custom checks whether the CPU + * fits a specific criteria so that we only return those CPUs. * * Note: This function returns the recommended CPUs as calculated during the * current invocation. 
By the time the call returns, the CPUs may have in @@ -66,7 +68,8 @@ static int convert_prio(int prio) * Return: (int)bool - CPUs were found */ int cpupri_find(struct cpupri *cp, struct task_struct *p, - struct cpumask *lowest_mask) + struct cpumask *lowest_mask, + bool (*fitness_fn)(struct task_struct *p, int cpu)) { int idx = 0; int task_pri = convert_prio(p->prio); @@ -107,6 +110,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, continue; if (lowest_mask) { + int cpu; + cpumask_and(lowest_mask, p->cpus_ptr, vec->mask); cpumask_andnot(lowest_mask, lowest_mask, cpu_isolated_mask); @@ -119,7 +124,23 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, * condition, simply act as though we never hit this * priority level and continue on. */ - if (cpumask_any(lowest_mask) >= nr_cpu_ids) + if (cpumask_empty(lowest_mask)) + continue; + + if (!fitness_fn) + return 1; + + /* Ensure the capacity of the CPUs fit the task */ + for_each_cpu(cpu, lowest_mask) { + if (!fitness_fn(p, cpu)) + cpumask_clear_cpu(cpu, lowest_mask); + } + + /* + * If no CPU at the current priority can fit the task + * continue looking + */ + if (cpumask_empty(lowest_mask)) continue; } diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index bab050019071..c08add835730 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -22,8 +22,9 @@ struct cpupri { }; #ifdef CONFIG_SMP -int cpupri_find(struct cpupri *cp, - struct task_struct *p, struct cpumask *lowest_mask); +int cpupri_find(struct cpupri *cp, struct task_struct *p, + struct cpumask *lowest_mask, + bool (*fitness_fn)(struct task_struct *p, int cpu)); void cpupri_set(struct cpupri *cp, int cpu, int pri); int cpupri_init(struct cpupri *cp); void cpupri_cleanup(struct cpupri *cp); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 797fd81f470c..339ae3761ba6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4113,7 +4113,7 @@ static inline unsigned long _task_util_est(struct task_struct *p) 
return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED); } -static inline unsigned long task_util_est(struct task_struct *p) +unsigned long task_util_est(struct task_struct *p) { #ifdef CONFIG_SCHED_WALT if (likely(!walt_disabled && sysctl_sched_use_walt_task_util)) @@ -4132,7 +4132,7 @@ static inline unsigned long uclamp_task_util(struct task_struct *p) #else static inline unsigned long uclamp_task_util(struct task_struct *p) { - return boosted_task_util(p); + return task_util_est(p); } #endif @@ -6875,19 +6875,23 @@ schedtune_margin(unsigned long signal, long boost, long capacity) return margin; } -static inline int -schedtune_cpu_margin(unsigned long util, int cpu) +inline long +schedtune_cpu_margin_with(unsigned long util, int cpu, struct task_struct *p) { - int boost = schedtune_cpu_boost(cpu); + int boost = schedtune_cpu_boost_with(cpu, p); + long margin; if (boost == 0) - return 0; + margin = 0; + else + margin = schedtune_margin(util, boost); - return schedtune_margin(util, boost, capacity_orig_of(cpu)); + trace_sched_boost_cpu(cpu, util, margin); + + return margin; } -static inline long -schedtune_task_margin(struct task_struct *task) +long schedtune_task_margin(struct task_struct *task) { int boost = schedtune_task_boost(task); unsigned long util; @@ -6904,50 +6908,14 @@ schedtune_task_margin(struct task_struct *task) #else /* CONFIG_SCHED_TUNE */ -static inline int -schedtune_cpu_margin(unsigned long util, int cpu) -{ - return 0; -} - -static inline int -schedtune_task_margin(struct task_struct *task) +inline long +schedtune_cpu_margin_with(unsigned long util, int cpu, struct task_struct *p) { return 0; } #endif /* CONFIG_SCHED_TUNE */ -unsigned long -boosted_cpu_util(int cpu, struct sched_walt_cpu_load *walt_load) -{ - unsigned long util = cpu_util_freq(cpu, walt_load); - long margin = schedtune_cpu_margin(util, cpu); - - trace_sched_boost_cpu(cpu, util, margin); - - return util + margin; -} - -static inline unsigned long -boosted_task_util(struct 
task_struct *task) -{ -#ifdef CONFIG_UCLAMP_TASK_GROUP - unsigned long util = task_util_est(task); - unsigned long util_min = uclamp_eff_value(task, UCLAMP_MIN); - unsigned long util_max = uclamp_eff_value(task, UCLAMP_MAX); - - return clamp(util, util_min, util_max); -#else - unsigned long util = task_util_est(task); - long margin = schedtune_task_margin(task); - - trace_sched_boost_task(task, util, margin); - - return util + margin; -#endif -} - static unsigned long cpu_util_without(int cpu, struct task_struct *p); static unsigned long capacity_spare_without(int cpu, struct task_struct *p) @@ -7465,7 +7433,7 @@ static inline int select_idle_sibling_cstate_aware(struct task_struct *p, int pr continue; /* figure out if the task can fit here at all */ - new_usage = boosted_task_util(p); + new_usage = uclamp_task(p); capacity_orig = capacity_orig_of(i); if (new_usage > capacity_orig) @@ -7627,7 +7595,7 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, bool prefer_idle, struct find_best_target_env *fbt_env) { - unsigned long min_util = boosted_task_util(p); + unsigned long min_util = uclamp_task(p); unsigned long target_capacity = ULONG_MAX; unsigned long min_wake_util = ULONG_MAX; unsigned long target_max_spare_cap = 0; @@ -7724,10 +7692,6 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, if (sched_cpu_high_irqload(i)) continue; - /* Skip CPUs which do not fit task requirements */ - if (capacity_of(i) < boosted_task_util(p)) - continue; - /* * p's blocked utilization is still accounted for on prev_cpu * so prev_cpu will receive a negative bias due to the double @@ -8280,7 +8244,7 @@ static inline struct energy_env *get_eenv(struct task_struct *p, int prev_cpu) * util for group utilization calculations */ eenv->util_delta = task_util_est(p); - eenv->util_delta_boosted = boosted_task_util(p); + eenv->util_delta_boosted = uclamp_task(p); cpumask_and(&cpumask_possible_cpus, p->cpus_ptr, cpu_online_mask); 
eenv->max_cpu_count = cpumask_weight(&cpumask_possible_cpus); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index d6ece8c45020..8cd41e4b39e9 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -458,6 +458,45 @@ static inline int on_rt_rq(struct sched_rt_entity *rt_se) return rt_se->on_rq; } +#ifdef CONFIG_UCLAMP_TASK +/* + * Verify the fitness of task @p to run on @cpu taking into account the uclamp + * settings. + * + * This check is only important for heterogeneous systems where uclamp_min value + * is higher than the capacity of a @cpu. For non-heterogeneous system this + * function will always return true. + * + * The function will return true if the capacity of the @cpu is >= the + * uclamp_min and false otherwise. + * + * Note that uclamp_min will be clamped to uclamp_max if uclamp_min + * > uclamp_max. + */ +static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) +{ + unsigned int min_cap; + unsigned int max_cap; + unsigned int cpu_cap; + + /* Only heterogeneous systems can benefit from this check */ + if (!static_branch_unlikely(&sched_asym_cpucapacity)) + return true; + + min_cap = uclamp_eff_value(p, UCLAMP_MIN); + max_cap = uclamp_eff_value(p, UCLAMP_MAX); + + cpu_cap = capacity_orig_of(cpu); + + return cpu_cap >= min(min_cap, max_cap); +} +#else +static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) +{ + return true; +} +#endif + #ifdef CONFIG_RT_GROUP_SCHED static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) @@ -1481,6 +1520,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) { struct task_struct *curr; struct rq *rq; + bool test; /* For anything but wake ups, just return the task_cpu */ if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) @@ -1512,11 +1552,17 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) * * This test is optimistic, if we get it wrong the load-balancer * will have to sort it out. 
+ * + * We take into account the capacity of the CPU to ensure it fits the + * requirement of the task - which is only important on heterogeneous + * systems like big.LITTLE. */ - if (energy_aware() || + test = energy_aware() || (curr && unlikely(rt_task(curr)) && (curr->nr_cpus_allowed < 2 || - curr->prio <= p->prio))) { + curr->prio <= p->prio)); + + if (test || !rt_task_fits_capacity(p, cpu)) { int target = find_lowest_rq(p); /* @@ -1540,15 +1586,15 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) * let's hope p can move out. */ if (rq->curr->nr_cpus_allowed == 1 || - !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) + !cpupri_find(&rq->rd->cpupri, rq->curr, NULL, NULL)) return; /* * p is migratable, so let's not schedule it and * see if it is pushed or pulled somewhere else. */ - if (p->nr_cpus_allowed != 1 - && cpupri_find(&rq->rd->cpupri, p, NULL)) + if (p->nr_cpus_allowed != 1 && + cpupri_find(&rq->rd->cpupri, p, NULL, NULL)) return; /* @@ -1706,7 +1752,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - cpumask_test_cpu(cpu, p->cpus_ptr)) + cpumask_test_cpu(cpu, p->cpus_ptr) && + rt_task_fits_capacity(p, cpu)) return 1; return 0; } @@ -1850,7 +1897,8 @@ static int find_lowest_rq(struct task_struct *task) if (task->nr_cpus_allowed == 1) return -1; /* No other targets possible */ - if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) + if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask, + rt_task_fits_capacity)) return -1; /* No targets found */ if (energy_aware()) @@ -2368,12 +2416,14 @@ skip: */ static void task_woken_rt(struct rq *rq, struct task_struct *p) { - if (!task_running(rq, p) && - !test_tsk_need_resched(rq->curr) && - p->nr_cpus_allowed > 1 && - (dl_task(rq->curr) || rt_task(rq->curr)) && - (rq->curr->nr_cpus_allowed < 2 || - rq->curr->prio <= p->prio)) + bool need_to_push = 
!task_running(rq, p) && + !test_tsk_need_resched(rq->curr) && + p->nr_cpus_allowed > 1 && + (dl_task(rq->curr) || rt_task(rq->curr)) && + (rq->curr->nr_cpus_allowed < 2 || + rq->curr->prio <= p->prio); + + if (need_to_push || !rt_task_fits_capacity(p, cpu_of(rq))) push_rt_tasks(rq); } @@ -2446,7 +2496,10 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) */ if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP - if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) + bool need_to_push = rq->rt.overloaded || + !rt_task_fits_capacity(p, cpu_of(rq)); + + if (p->nr_cpus_allowed > 1 && need_to_push) queue_push_tasks(rq); #endif /* CONFIG_SMP */ if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d2fa46b12a42..9fb0b3530f94 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -427,8 +427,6 @@ struct task_group { struct uclamp_se uclamp[UCLAMP_CNT]; /* Latency-sensitive flag used for a task group */ unsigned int latency_sensitive; - /* Boosted flag for a task group */ - unsigned int boosted; #endif }; @@ -953,15 +951,16 @@ struct rq { unsigned long nr_load_updates; u64 nr_switches; + struct cfs_rq cfs; + struct rt_rq rt; + struct dl_rq dl; + #ifdef CONFIG_UCLAMP_TASK /* Utilization clamp values based on CPU's RUNNABLE tasks */ struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned; unsigned int uclamp_flags; #define UCLAMP_FLAG_IDLE 0x01 #endif - struct cfs_rq cfs; - struct rt_rq rt; - struct dl_rq dl; #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ @@ -2323,7 +2322,7 @@ cpu_util_freq_walt(int cpu, struct sched_walt_cpu_load *walt_load) static inline unsigned long cpu_util_freq(int cpu, struct sched_walt_cpu_load *walt_load) { - return cpu_util_freq_walt(cpu, walt_load); + return min(cpu_util(cpu), capacity_orig_of(cpu)); } #else @@ -2412,12 +2411,6 @@ static inline unsigned long cpu_util(int cpu) return min(__cpu_util(cpu) + cpu_util_rt(cpu_rq(cpu)), 
capacity_orig_of(cpu)); } -static inline unsigned long -cpu_util_freq(int cpu, struct sched_walt_cpu_load *walt_load) -{ - return min(cpu_util(cpu), capacity_orig_of(cpu)); -} - extern unsigned int capacity_margin_freq; static inline unsigned long @@ -2774,6 +2767,17 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif /* CONFIG_CPU_FREQ */ +#ifdef CONFIG_SCHED_WALT + +static inline bool +walt_task_in_cum_window_demand(struct rq *rq, struct task_struct *p) +{ + return cpu_of(rq) == task_cpu(p) && + (p->on_rq || p->last_sleep_ts >= rq->window_start); +} + +#endif /* CONFIG_SCHED_WALT */ + #ifdef CONFIG_UCLAMP_TASK unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); @@ -2892,60 +2896,10 @@ static inline bool uclamp_rq_is_idle(struct rq *rq) } #endif /* CONFIG_UCLAMP_TASK */ -#ifdef CONFIG_UCLAMP_TASK_GROUP -static inline bool uclamp_latency_sensitive(struct task_struct *p) -{ - struct cgroup_subsys_state *css = task_css(p, cpuset_cgrp_id); - struct task_group *tg; - - if (!css) - return false; - - if (!strlen(css->cgroup->kn->name)) - return 0; - - tg = container_of(css, struct task_group, css); - - return tg->latency_sensitive; -} - -static inline bool uclamp_boosted(struct task_struct *p) -{ - struct cgroup_subsys_state *css = task_css(p, cpuset_cgrp_id); - struct task_group *tg; - - if (!css) - return false; - - if (!strlen(css->cgroup->kn->name)) - return 0; - - tg = container_of(css, struct task_group, css); - - return tg->boosted; -} -#else -static inline bool uclamp_latency_sensitive(struct task_struct *p) -{ - return false; -} - -static inline bool uclamp_boosted(struct task_struct *p) -{ - return false; -} -#endif /* CONFIG_UCLAMP_TASK_GROUP */ - -#ifdef CONFIG_SCHED_WALT - -static inline bool -walt_task_in_cum_window_demand(struct rq *rq, struct task_struct *p) -{ - return cpu_of(rq) == task_cpu(p) && - (p->on_rq || 
p->last_sleep_ts >= rq->window_start); -} - -#endif /* CONFIG_SCHED_WALT */ +unsigned long task_util_est(struct task_struct *p); +unsigned int uclamp_task(struct task_struct *p); +bool uclamp_latency_sensitive(struct task_struct *p); +bool uclamp_boosted(struct task_struct *p); #ifdef arch_scale_freq_capacity #ifndef arch_scale_freq_invariant diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index eb1028697665..5f5c94e2a1df 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -529,10 +529,11 @@ void schedtune_dequeue_task(struct task_struct *p, int cpu) raw_spin_unlock_irqrestore(&bg->lock, irq_flags); } -int schedtune_cpu_boost(int cpu) +int schedtune_cpu_boost_with(int cpu, struct task_struct *p) { struct boost_groups *bg; u64 now; + int task_boost = p ? schedtune_task_boost(p) : -100; bg = &per_cpu(cpu_boost_groups, cpu); now = sched_clock_cpu(cpu); @@ -541,7 +542,7 @@ int schedtune_cpu_boost(int cpu) if (schedtune_boost_timeout(now, bg->boost_ts)) schedtune_cpu_update(cpu, now); - return bg->boost_max; + return max(bg->boost_max, task_boost); } int schedtune_task_boost(struct task_struct *p) diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index 9508c151a42b..4ab18eddd8e6 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -12,7 +12,7 @@ struct target_nrg { struct reciprocal_value rdiv; }; -int schedtune_cpu_boost(int cpu); +int schedtune_cpu_boost_with(int cpu, struct task_struct *p); int schedtune_task_boost(struct task_struct *tsk); int schedtune_task_boost_rcu_locked(struct task_struct *tsk); @@ -23,7 +23,7 @@ void schedtune_dequeue_task(struct task_struct *p, int cpu); #else /* CONFIG_SCHED_TUNE */ -#define schedtune_cpu_boost(cpu) 0 +#define schedtune_cpu_boost_with(cpu, p) 0 #define schedtune_task_boost(tsk) 0 #define schedtune_prefer_idle(tsk) 0 From 4915f9a08afc3a883b4f011267469ff119d2cc69 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 8 May 2023 08:17:44 +0200 Subject: [PATCH 20/30] Revert "softirq: Let ksoftirqd 
do its job" This reverts the following commits: 4cd13c21b207 ("softirq: Let ksoftirqd do its job") 3c53776e29f8 ("Mark HI and TASKLET softirq synchronous") 1342d8080f61 ("softirq: Don't skip softirq execution when softirq thread is parking") in a single change to avoid known bad intermediate states introduced by a patch series reverting them individually. Due to the mentioned commit, when the ksoftirqd threads take charge of softirq processing, the system can experience high latencies. In the past a few workarounds have been implemented for specific side-effects of the initial ksoftirqd enforcement commit: commit 1ff688209e2e ("watchdog: core: make sure the watchdog_worker is not deferred") commit 8d5755b3f77b ("watchdog: softdog: fire watchdog even if softirqs do not get to run") commit 217f69743681 ("net: busy-poll: allow preemption in sk_busy_loop()") commit 3c53776e29f8 ("Mark HI and TASKLET softirq synchronous") But the latency problem still exists in real-life workloads, see the link below. The reverted commit intended to solve a live-lock scenario that can now be addressed with the NAPI threaded mode, introduced with commit 29863d41bb6e ("net: implement threaded-able napi poll loop support"), which is nowadays in a pretty stable status. While a complete solution to put softirq processing under nice resource control would be preferable, that has proven to be a very hard task. In the short term, remove the main pain point, and also simplify a bit the current softirq implementation. Signed-off-by: Paolo Abeni Signed-off-by: Thomas Gleixner Tested-by: Jason Xing Reviewed-by: Jakub Kicinski Reviewed-by: Eric Dumazet Reviewed-by: Sebastian Andrzej Siewior Cc: "Paul E. 
McKenney" Cc: Peter Zijlstra Cc: netdev@vger.kernel.org Link: https://lore.kernel.org/netdev/305d7742212cbe98621b16be782b0562f1012cb6.camel@redhat.com Link: https://lore.kernel.org/r/57e66b364f1b6f09c9bc0316742c3b14f4ce83bd.1683526542.git.pabeni@redhat.com (cherry picked from commit d15121be7485655129101f3960ae6add40204463) Change-Id: If014afbfa3bb56f7c490a22b8334857c8308f901 Signed-off-by: Alexander Winkowski --- kernel/softirq.c | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/kernel/softirq.c b/kernel/softirq.c index 4896a0eb178e..5f8ebcccbb0a 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -77,21 +77,6 @@ static void wakeup_softirqd(void) wake_up_process(tsk); } -/* - * If ksoftirqd is scheduled, we do not want to process pending softirqs - * right now. Let ksoftirqd handle this at its own rate, to get fairness, - * unless we're doing some of the synchronous softirqs. - */ -#define SOFTIRQ_NOW_MASK ((1 << HI_SOFTIRQ) | (1 << TASKLET_SOFTIRQ)) -static bool ksoftirqd_running(unsigned long pending) -{ - struct task_struct *tsk = __this_cpu_read(ksoftirqd); - - if (pending & SOFTIRQ_NOW_MASK) - return false; - return tsk && (tsk->state == TASK_RUNNING); -} - /* * preempt_count and SOFTIRQ_OFFSET usage: * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving @@ -333,7 +318,7 @@ asmlinkage __visible void do_softirq(void) pending = local_softirq_pending(); - if (pending && !ksoftirqd_running(pending)) + if (pending) do_softirq_own_stack(); local_irq_restore(flags); @@ -360,9 +345,6 @@ void irq_enter(void) static inline void invoke_softirq(void) { - if (ksoftirqd_running(local_softirq_pending())) - return; - if (!force_irqthreads) { #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK /* From 6be9588eedcbc0df294cc69b78aa0a687ac91225 Mon Sep 17 00:00:00 2001 From: kondors1995 Date: Sun, 25 Aug 2024 22:06:20 +0300 Subject: [PATCH 21/30] Revert "sched/core: wake up from idle by sending IPI_WAKEUP" This reverts commit 
f49348dd5a0e22fce053a15b335dc1daf9ecb922. --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d64bc47e0532..bfdd75dcab5a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2821,7 +2821,7 @@ void wake_up_if_idle(int cpu) } else { rq_lock_irqsave(rq, &rf); if (is_idle_task(rq->curr)) - arch_send_wakeup_ipi_mask(cpumask_of(cpu)); + smp_send_reschedule(cpu); /* Else CPU is not idle, do nothing here: */ rq_unlock_irqrestore(rq, &rf); } From 5ed0fda4620f5e9745b8963845f599921c89d262 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Sep 2021 22:16:02 +0200 Subject: [PATCH 22/30] sched: Simplify wake_up_*idle*() Simplify and make wake_up_if_idle() more robust, also don't iterate the whole machine with preempt_disable() in it's caller: wake_up_all_idle_cpus(). This prepares for another wake_up_if_idle() user that needs a full do_idle() cycle. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Vasily Gorbik Tested-by: Vasily Gorbik # on s390 Link: https://lkml.kernel.org/r/20210929152428.769328779@infradead.org Change-Id: If9f0dd5d99dd675828656161bfdba299f447f998 (cherry picked from 8850cb663b5cda04d33f9cfbc38889d73d3c8e24) Signed-off-by: Alexander Winkowski --- kernel/sched/core.c | 14 +++++--------- kernel/smp.c | 6 +++--- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bfdd75dcab5a..836ac6c3df85 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2816,15 +2816,11 @@ void wake_up_if_idle(int cpu) if (!is_idle_task(rcu_dereference(rq->curr))) goto out; - if (set_nr_if_polling(rq->idle)) { - trace_sched_wake_idle_without_ipi(cpu); - } else { - rq_lock_irqsave(rq, &rf); - if (is_idle_task(rq->curr)) - smp_send_reschedule(cpu); - /* Else CPU is not idle, do nothing here: */ - rq_unlock_irqrestore(rq, &rf); - } + rq_lock_irqsave(rq, &rf); + if (is_idle_task(rq->curr)) + resched_curr(rq); + /* Else CPU 
is not idle, do nothing here: */ + rq_unlock_irqrestore(rq, &rf); out: rcu_read_unlock(); diff --git a/kernel/smp.c b/kernel/smp.c index fd749ced516f..fcb8d7285c41 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -811,16 +811,16 @@ void wake_up_all_idle_cpus(void) { int cpu; - preempt_disable(); + cpus_read_lock(); for_each_online_cpu(cpu) { - if (cpu == smp_processor_id()) + if (cpu == raw_smp_processor_id()) continue; if (s2idle_state == S2IDLE_STATE_ENTER || !cpu_isolated(cpu)) wake_up_if_idle(cpu); } - preempt_enable(); + cpus_read_unlock(); } EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); From 73bad98694733f43495f91376198b9e593241277 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 18 Oct 2021 16:41:05 +0200 Subject: [PATCH 23/30] sched: Improve wake_up_all_idle_cpus() take #2 As reported by syzbot and experienced by Pavel, using cpus_read_lock() in wake_up_all_idle_cpus() generates lock inversion (against mmap_sem and possibly others). Instead, shrink the preempt disable region by iterating all CPUs and checking the online status for each individual CPU while having preemption disabled. 
Fixes: 8850cb663b5c ("sched: Simplify wake_up_*idle*()") Reported-by: syzbot+d5b23b18d2f4feae8a67@syzkaller.appspotmail.com Reported-by: Pavel Machek Reported-by: Qian Cai Signed-off-by: Peter Zijlstra (Intel) Tested-by: Qian Cai Change-Id: I652eb678e8a2e30d71beeebae35a36d4d4c49a8d (cherry picked from 96611c26dc351c33f73b48756a9feacc109e5bab) Signed-off-by: Alexander Winkowski # Conflicts: # kernel/smp.c --- kernel/smp.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/kernel/smp.c b/kernel/smp.c index fcb8d7285c41..bac329f2b441 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -811,16 +811,12 @@ void wake_up_all_idle_cpus(void) { int cpu; - cpus_read_lock(); - for_each_online_cpu(cpu) { - if (cpu == raw_smp_processor_id()) - continue; - - if (s2idle_state == S2IDLE_STATE_ENTER || - !cpu_isolated(cpu)) + for_each_possible_cpu(cpu) { + preempt_disable(); + if (cpu != smp_processor_id() && cpu_online(cpu)) wake_up_if_idle(cpu); + preempt_enable(); } - cpus_read_unlock(); } EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); From be5603e38e45e5a4809d5460720a06f1a3b57397 Mon Sep 17 00:00:00 2001 From: kondors1995 Date: Sun, 25 Aug 2024 22:12:59 +0300 Subject: [PATCH 24/30] sched:remove unused value --- kernel/sched/core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 836ac6c3df85..ad722395ce14 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4182,7 +4182,6 @@ void scheduler_tick(void) bool early_notif; u32 old_load; struct related_thread_group *grp; - unsigned int flag = 0; unsigned long thermal_pressure; sched_clock_tick(); From ea10cc32c6677019cd1cc0322f90e9d4847ff246 Mon Sep 17 00:00:00 2001 From: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> Date: Wed, 22 May 2024 15:56:14 +0800 Subject: [PATCH 25/30] cpufreq: schedutil: Checkout to msm-4.19 * HEAD: https://github.com/EmanuelCN/kernel_xiaomi_sm8250/commit/2c85e5f15e7f5467949d619e9d445dc734984e10 * Remove WALT and 
SchedTune pieces as we won't use them anymore * Remove SCHED_CPUFREQ_RT_DL in include/linux/sched/cpufreq.h as it's unused now * Add some new scheduler definitions from msm-4.19 for new schedutil * Optimize freq switching for sm8150 (sm8150 use software freq switching) * Minor format and code adjustments (compared to HEAD) * Note: The new schedutil requires UClamp, which will be backported later Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> --- include/linux/sched/cpufreq.h | 2 - kernel/sched/cpufreq_schedutil.c | 104 ++++++++++---------- kernel/sched/sched.h | 162 ++++++++++++++++--------------- 3 files changed, 138 insertions(+), 130 deletions(-) diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h index f794f8c31298..7a4050b79ec4 100644 --- a/include/linux/sched/cpufreq.h +++ b/include/linux/sched/cpufreq.h @@ -18,8 +18,6 @@ #define SCHED_CPUFREQ_FORCE_UPDATE (1U << 7) #define SCHED_CPUFREQ_CONTINUE (1U << 8) -#define SCHED_CPUFREQ_RT_DL (SCHED_CPUFREQ_RT | SCHED_CPUFREQ_DL) - #ifdef CONFIG_CPU_FREQ struct update_util_data { void (*func)(struct update_util_data *data, u64 time, unsigned int flags); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 064e09359a6c..8d95439408eb 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -108,6 +108,10 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) return true; } + /* If the last frequency wasn't set yet then we can still amend it */ + if (sg_policy->work_in_progress) + return true; + /* No need to recalculate next freq for min_rate_limit_us * at least. 
However we might still decide to further rate * limit once frequency change direction is decided, according @@ -120,7 +124,11 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) static inline bool use_pelt(void) { +#ifdef CONFIG_SCHED_WALT + return false; +#else return true; +#endif } static bool sugov_up_down_rate_limit(struct sugov_policy *sg_policy, u64 time, @@ -248,9 +256,6 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, return l_freq; } -extern long -schedtune_cpu_margin_with(unsigned long util, int cpu, struct task_struct *p); - /* * This function computes an effective utilization for the given CPU, to be * used for frequency selection given the linear relation: f = u * f_max. @@ -357,11 +362,10 @@ unsigned long apply_dvfs_headroom(int cpu, unsigned long util, unsigned long max if (!util || util >= max_cap) return util; - if (cpumask_test_cpu(cpu, cpu_lp_mask)) { + if (cpumask_test_cpu(cpu, cpu_lp_mask)) headroom = util + (util >> 1); - } else { + else headroom = util + (util >> 2); - } return headroom; } @@ -543,8 +547,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, struct sugov_policy *sg_policy = sg_cpu->sg_policy; unsigned long max_cap; unsigned int next_f; + unsigned long boost; bool busy; - unsigned long boost; max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); @@ -601,7 +605,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) for_each_cpu(j, policy->cpus) { struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); - unsigned long boost; + unsigned long boost; s64 delta_ns; /* @@ -761,6 +765,28 @@ static struct attribute *sugov_attributes[] = { NULL }; +static void sugov_tunables_save(struct cpufreq_policy *policy, + struct sugov_tunables *tunables) +{ + int cpu; + struct sugov_tunables *cached = per_cpu(cached_tunables, policy->cpu); + + if (!have_governor_per_policy()) + return; + + if (!cached) { + cached = kzalloc(sizeof(*tunables), GFP_KERNEL); 
+ if (!cached) + return; + + for_each_cpu(cpu, policy->related_cpus) + per_cpu(cached_tunables, cpu) = cached; + } + + cached->up_rate_limit_us = tunables->up_rate_limit_us; + cached->down_rate_limit_us = tunables->down_rate_limit_us; +} + static void sugov_tunables_free(struct kobject *kobj) { struct gov_attr_set *attr_set = container_of(kobj, struct gov_attr_set, kobj); @@ -768,6 +794,19 @@ static void sugov_tunables_free(struct kobject *kobj) kfree(to_sugov_tunables(attr_set)); } +static void sugov_tunables_restore(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy = policy->governor_data; + struct sugov_tunables *tunables = sg_policy->tunables; + struct sugov_tunables *cached = per_cpu(cached_tunables, policy->cpu); + + if (!cached) + return; + + tunables->up_rate_limit_us = cached->up_rate_limit_us; + tunables->down_rate_limit_us = cached->down_rate_limit_us; +} + static struct kobj_type sugov_tunables_ktype = { .default_attrs = sugov_attributes, .sysfs_ops = &governor_sysfs_ops, @@ -825,7 +864,8 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) } sg_policy->thread = thread; - kthread_bind_mask(thread, policy->related_cpus); + if (!policy->dvfs_possible_from_any_cpu) + kthread_bind_mask(thread, policy->related_cpus); init_irq_work(&sg_policy->irq_work, sugov_irq_work); mutex_init(&sg_policy->work_lock); @@ -858,48 +898,12 @@ static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_polic return tunables; } -static void sugov_tunables_save(struct cpufreq_policy *policy, - struct sugov_tunables *tunables) -{ - int cpu; - struct sugov_tunables *cached = per_cpu(cached_tunables, policy->cpu); - - if (!have_governor_per_policy()) - return; - - if (!cached) { - cached = kzalloc(sizeof(*tunables), GFP_KERNEL); - if (!cached) - return; - - for_each_cpu(cpu, policy->related_cpus) - per_cpu(cached_tunables, cpu) = cached; - } - - cached->up_rate_limit_us = tunables->up_rate_limit_us; - cached->down_rate_limit_us = 
tunables->down_rate_limit_us; -} - - static void sugov_clear_global_tunables(void) { if (!have_governor_per_policy()) global_tunables = NULL; } -static void sugov_tunables_restore(struct cpufreq_policy *policy) -{ - struct sugov_policy *sg_policy = policy->governor_data; - struct sugov_tunables *tunables = sg_policy->tunables; - struct sugov_tunables *cached = per_cpu(cached_tunables, policy->cpu); - - if (!cached) - return; - - tunables->up_rate_limit_us = cached->up_rate_limit_us; - tunables->down_rate_limit_us = cached->down_rate_limit_us; -} - static int sugov_init(struct cpufreq_policy *policy) { struct sugov_policy *sg_policy; @@ -989,13 +993,15 @@ static void sugov_exit(struct cpufreq_policy *policy) mutex_lock(&global_tunables_lock); + /* Save tunables before last owner release it in gov_attr_set_put() */ + if (tunables->attr_set.usage_count == 1) + sugov_tunables_save(policy, tunables); + count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook); policy->governor_data = NULL; - - if (!count) { - sugov_tunables_save(policy, tunables); + if (!count) sugov_clear_global_tunables(); - } + mutex_unlock(&global_tunables_lock); sugov_kthread_stop(sg_policy); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9fb0b3530f94..2ef572842824 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2332,85 +2332,6 @@ cpu_util_freq(int cpu, struct sched_walt_cpu_load *walt_load) #endif /* CONFIG_SCHED_WALT */ -#ifdef CONFIG_SMP -static inline unsigned long cpu_util_cfs(struct rq *rq) -{ - unsigned long util = READ_ONCE(rq->cfs.avg.util_avg); - - if (sched_feat(UTIL_EST)) { - util = max_t(unsigned long, util, - READ_ONCE(rq->cfs.avg.util_est.enqueued)); - } - - return util; -} -#endif - -#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL -unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, - unsigned long *min, - unsigned long *max); - -unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual, - unsigned long min, - unsigned 
long max); - -static inline unsigned long cpu_bw_dl(struct rq *rq) -{ - return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; -} - -static inline unsigned long cpu_util_dl(struct rq *rq) -{ - return READ_ONCE(rq->avg_dl.util_avg); -} - -static inline unsigned long cpu_util_rt(struct rq *rq) -{ - return READ_ONCE(rq->avg_rt.util_avg); -} -#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ -static inline unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, - unsigned long max, enum schedutil_type type, - struct task_struct *p) -{ - return 0; -} -#endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ - -#ifdef CONFIG_HAVE_SCHED_AVG_IRQ -static inline unsigned long cpu_util_irq(struct rq *rq) -{ - return rq->avg_irq.util_avg; -} - -static inline -unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) -{ - util *= (max - irq); - util /= max; - - return util; - -} -#else -static inline unsigned long cpu_util_irq(struct rq *rq) -{ - return 0; -} - -static inline -unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) -{ - return util; -} -#endif - -static inline unsigned long cpu_util(int cpu) -{ - return min(__cpu_util(cpu) + cpu_util_rt(cpu_rq(cpu)), capacity_orig_of(cpu)); -} - extern unsigned int capacity_margin_freq; static inline unsigned long @@ -2909,6 +2830,89 @@ bool uclamp_boosted(struct task_struct *p); #define arch_scale_freq_invariant() (false) #endif +#ifdef CONFIG_SMP +static inline unsigned long cpu_util_cfs(struct rq *rq) +{ + unsigned long util = READ_ONCE(rq->cfs.avg.util_avg); + + if (sched_feat(UTIL_EST)) { + util = max_t(unsigned long, util, + READ_ONCE(rq->cfs.avg.util_est.enqueued)); + } + + return util; +} +#endif + +#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL +unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, + unsigned long *min, + unsigned long *max); + +unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual, + unsigned long min, + unsigned long 
max); + +static inline unsigned long cpu_bw_dl(struct rq *rq) +{ + return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; +} + +static inline unsigned long cpu_util_dl(struct rq *rq) +{ + return READ_ONCE(rq->avg_dl.util_avg); +} + +static inline unsigned long cpu_util_rt(struct rq *rq) +{ + return READ_ONCE(rq->avg_rt.util_avg); +} +#endif + +#ifdef CONFIG_SMP +#ifndef CONFIG_SCHED_WALT +static inline unsigned long cpu_util(int cpu) +{ + return min(__cpu_util(cpu) + cpu_util_rt(cpu_rq(cpu)), + capacity_orig_of(cpu)); +} +#endif +#endif + +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ +static inline unsigned long cpu_util_irq(struct rq *rq) +{ + return rq->avg_irq.util_avg; +} + +static inline +unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) +{ + util *= (max - irq); + util /= max; + + return util; + +} +#else +static inline unsigned long cpu_util_irq(struct rq *rq) +{ + return 0; +} + +static inline +unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) +{ + return util; +} +#endif + +static inline unsigned long +cpu_util_freq(int cpu, struct sched_walt_cpu_load *walt_load) +{ + return min(cpu_util(cpu), capacity_orig_of(cpu)); +} + enum sched_boost_policy { SCHED_BOOST_NONE, SCHED_BOOST_ON_BIG, From de9a4993cee2769b3199108ba5be0a6dc8a794b6 Mon Sep 17 00:00:00 2001 From: EmanuelCN Date: Fri, 23 Aug 2024 16:40:59 +0300 Subject: [PATCH 26/30] Revert "cpufreq: schedutil: Ignore CPU load older than WALT window size" This reverts commit b9d1ecec68e9750f819dca02af47caee55d06a58. 
--- kernel/sched/cpufreq_schedutil.c | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 8d95439408eb..6cb0730e089e 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -75,7 +75,6 @@ struct sugov_cpu { }; static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); -static unsigned int stale_ns; static DEFINE_PER_CPU(struct sugov_tunables *, cached_tunables); /************************ Governor internals ***********************/ @@ -606,21 +605,6 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) for_each_cpu(j, policy->cpus) { struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); unsigned long boost; - s64 delta_ns; - - /* - * If the CPU utilization was last updated before the previous - * frequency update and the time elapsed between the last update - * of the CPU utilization and the last frequency update is long - * enough, don't take the CPU into account as it probably is - * idle now (and clear iowait_boost for it). 
- */ - delta_ns = time - j_sg_cpu->last_update; - if (delta_ns > stale_ns) { - sugov_iowait_reset(j_sg_cpu, time, false); - continue; - } - boost = sugov_iowait_apply(j_sg_cpu, time, max_cap); sugov_get_util(j_sg_cpu, boost); @@ -952,8 +936,6 @@ static int sugov_init(struct cpufreq_policy *policy) policy->governor_data = sg_policy; sg_policy->tunables = tunables; - stale_ns = sched_ravg_window + (sched_ravg_window >> 3); - sugov_tunables_restore(policy); ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype, From d9324db2412fde8a5f86aefa5ab30e1e0e4be1ce Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Wed, 28 Aug 2024 22:56:59 +0300 Subject: [PATCH 27/30] cpufreq: schedutil: Allow single-CPU frequency to drop without idling Given that a CPU's clock is gated at even the shallowest idle state, waiting until a CPU idles at least once before reducing its frequency is putting the cart before the horse. For long-running workloads with low compute needs, requiring an idle call since the last frequency update to lower the CPU's frequency results in significantly increased energy usage. Given that there is already a mechanism in place to ratelimit frequency changes, this heuristic is wholly unnecessary. Allow single-CPU performance domains to drop their frequency without requiring an idle call in between to improve energy. Right off the bat, this reduces CPU power consumption by 7.5% playing a cat gif in Firefox on a Pixel 8 (270 mW -> 250 mW). And there is no visible loss of performance. 
Signed-off-by: Sultan Alsawaf --- kernel/sched/cpufreq_schedutil.c | 34 -------------------------------- 1 file changed, 34 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 6cb0730e089e..eb1df0c899a6 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -67,11 +67,6 @@ struct sugov_cpu { unsigned long util; unsigned long bw_min; - - /* The field below is for single-CPU policies only: */ -#ifdef CONFIG_NO_HZ_COMMON - unsigned long saved_idle_calls; -#endif }; static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); @@ -516,19 +511,6 @@ static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, return (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT; } -#ifdef CONFIG_NO_HZ_COMMON -static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) -{ - unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu); - bool ret = idle_calls == sg_cpu->saved_idle_calls; - - sg_cpu->saved_idle_calls = idle_calls; - return ret; -} -#else -static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } -#endif /* CONFIG_NO_HZ_COMMON */ - /* * Make sugov_should_update_freq() ignore the rate limit when DL * has increased the utilization. 
@@ -547,7 +529,6 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, unsigned long max_cap; unsigned int next_f; unsigned long boost; - bool busy; max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); @@ -559,25 +540,10 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, if (!sugov_should_update_freq(sg_policy, time)) return; - /* Limits may have changed, don't skip frequency update */ - busy = use_pelt() && !sg_policy->need_freq_update && - sugov_cpu_is_busy(sg_cpu); - boost = sugov_iowait_apply(sg_cpu, time, max_cap); sugov_get_util(sg_cpu, boost); next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap); - /* - * Do not reduce the frequency if the CPU has not been idle - * recently, as the reduction is likely to be premature then. - */ - if (busy && next_f < sg_policy->next_freq && - !sg_policy->need_freq_update) { - next_f = sg_policy->next_freq; - - /* Restore cached freq as next_freq has changed */ - sg_policy->cached_raw_freq = sg_policy->prev_cached_raw_freq; - } /* * This code runs under rq->lock for the target CPU, so it won't run From 1845102b9f56aaf2ad2843073e5cfc920556a357 Mon Sep 17 00:00:00 2001 From: Sheenam Monga Date: Wed, 17 Apr 2024 15:39:57 +0530 Subject: [PATCH 28/30] BACKPORT: qcacmn: Fix potential OOB read in util_scan_parse_rnr_ie Currently, while parsing scan RNR Ie data is moved to next neighbor_ap_info_field after parsing the current neighbor_ap_info_field. But in last iteration pointer may try to access invalid data if (uint8_t *)ie + rnr_ie_len + 2) bytes are less than sizeof neighbor_ap_info_field and same is the case with tbtt_length access. Fix is to add a length check of data + next data size to be parsed < (uint8_t *)ie + rnr_ie_len + 2) instead of adding a validation of data length only. 
CRs-Fixed: 3710080 Change-Id: I05e5a9a02f0f4f9bc468db894588e676f0a248c0 --- .../umac/scan/dispatcher/src/wlan_scan_utils_api.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/staging/qca-wifi-host-cmn/umac/scan/dispatcher/src/wlan_scan_utils_api.c b/drivers/staging/qca-wifi-host-cmn/umac/scan/dispatcher/src/wlan_scan_utils_api.c index 86c1c1b81a02..02ab28c5bc97 100644 --- a/drivers/staging/qca-wifi-host-cmn/umac/scan/dispatcher/src/wlan_scan_utils_api.c +++ b/drivers/staging/qca-wifi-host-cmn/umac/scan/dispatcher/src/wlan_scan_utils_api.c @@ -709,7 +709,8 @@ util_scan_parse_rnr_ie(struct scan_cache_entry *scan_entry, rnr_ie_len = ie->ie_len; data = (uint8_t *)ie + sizeof(struct ie_header); - while (data < ((uint8_t *)ie + rnr_ie_len + 2)) { + while ((data + sizeof(struct neighbor_ap_info_field)) <= + ((uint8_t *)ie + rnr_ie_len + 2)) { neighbor_ap_info = (struct neighbor_ap_info_field *)data; tbtt_count = neighbor_ap_info->tbtt_header.tbtt_info_count; tbtt_length = neighbor_ap_info->tbtt_header.tbtt_info_length; @@ -725,7 +726,8 @@ util_scan_parse_rnr_ie(struct scan_cache_entry *scan_entry, break; for (i = 0; i < (tbtt_count + 1) && - data < ((uint8_t *)ie + rnr_ie_len + 2); i++) { + (data + tbtt_length) <= + ((uint8_t *)ie + rnr_ie_len + 2); i++) { if (i < MAX_RNR_BSS) util_scan_update_rnr( &scan_entry->rnr.bss_info[i], From d5d983781ff4d797e036d307b53aa8734d3232c7 Mon Sep 17 00:00:00 2001 From: John Galt Date: Fri, 16 Aug 2024 08:49:11 -0400 Subject: [PATCH 29/30] raphael_defconfig: enable audit + avc stats --- arch/arm64/configs/raphael_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/configs/raphael_defconfig b/arch/arm64/configs/raphael_defconfig index 05c1b41e1b5c..864dcfa430e1 100644 --- a/arch/arm64/configs/raphael_defconfig +++ b/arch/arm64/configs/raphael_defconfig @@ -2,6 +2,7 @@ CONFIG_TOOLS_SUPPORT_RELR=y CONFIG_LOCALVERSION="-SOVIET-STAR-" CONFIG_INLINE_OPTIMIZATION=y # CONFIG_FHANDLE is not set 
+CONFIG_AUDIT=y CONFIG_IRQ_SBALANCE=y CONFIG_SBALANCE_EXCLUDE_CPUS="3,6,7" CONFIG_NO_HZ=y From e5a1fc529ab0d6da058111756a8e80d0eef951ac Mon Sep 17 00:00:00 2001 From: kondors1995 Date: Sun, 1 Sep 2024 18:00:27 +0300 Subject: [PATCH 30/30] R6.5 --- build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.sh b/build.sh index 7b5c09c6c83b..bfe429a1f87f 100755 --- a/build.sh +++ b/build.sh @@ -15,7 +15,7 @@ export THINLTO_CACHE=~/ltocache/ DEFCONFIG="raphael_defconfig" # Kernel Details -REV="R6.4" +REV="R6.5" EDITION="BLACK" VER="$EDITION"-"$REV"