diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index 7e3672812510..6d247618dca2 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -902,6 +902,13 @@ controller implements weight and absolute bandwidth limit models for normal scheduling policy and absolute bandwidth allocation model for realtime scheduling policy. +In all the above models, cycles distribution is defined only on a temporal +base and it does not account for the frequency at which tasks are executed. +The (optional) utilization clamping support allows to hint the schedutil +cpufreq governor about the minimum desired frequency which should always be +provided by a CPU, as well as the maximum desired frequency, which should not +be exceeded by a CPU. + CPU Interface Files ~~~~~~~~~~~~~~~~~~~ @@ -964,6 +971,33 @@ All time durations are in microseconds. Shows pressure stall information for CPU. See Documentation/accounting/psi.txt for details. + cpu.uclamp.min + A read-write single value file which exists on non-root cgroups. + The default is "0", i.e. no utilization boosting. + + The requested minimum utilization (protection) as a percentage + rational number, e.g. 12.34 for 12.34%. + + This interface allows reading and setting minimum utilization clamp + values similar to the sched_setattr(2). This minimum utilization + value is used to clamp the task specific minimum utilization clamp. + + The requested minimum utilization (protection) is always capped by + the current value for the maximum utilization (limit), i.e. + `cpu.uclamp.max`. + + cpu.uclamp.max + A read-write single value file which exists on non-root cgroups. + The default is "max", i.e. no utilization capping. + + The requested maximum utilization (limit) as a percentage rational + number, e.g. 98.76 for 98.76%. + + This interface allows reading and setting maximum utilization clamp + values similar to the sched_setattr(2). 
This maximum utilization + value is used to clamp the task specific maximum utilization clamp. + + Memory ------ diff --git a/Documentation/scheduler/sched-tune.txt b/Documentation/scheduler/sched-tune.txt index 1a103715f7bd..be728705fe25 100644 --- a/Documentation/scheduler/sched-tune.txt +++ b/Documentation/scheduler/sched-tune.txt @@ -233,9 +233,9 @@ Thus, with the sched_cfs_boost enabled we have the following main functions to get the current utilization of a CPU: cpu_util() - boosted_cpu_util() + stune_util() -The new boosted_cpu_util() is similar to the first but returns a boosted +The new stune_util() is similar to the first but returns a boosted utilization signal which is a function of the sched_cfs_boost value. This function is used in the CFS scheduler code paths where schedutil needs to diff --git a/arch/arm64/configs/raphael_defconfig b/arch/arm64/configs/raphael_defconfig index c5f42126ad6e..2c9a7dc37b46 100644 --- a/arch/arm64/configs/raphael_defconfig +++ b/arch/arm64/configs/raphael_defconfig @@ -2,6 +2,7 @@ CONFIG_TOOLS_SUPPORT_RELR=y CONFIG_LOCALVERSION="-SOVIET-STAR-" CONFIG_INLINE_OPTIMIZATION=y # CONFIG_FHANDLE is not set +CONFIG_AUDIT=y CONFIG_IRQ_SBALANCE=y CONFIG_SBALANCE_EXCLUDE_CPUS="3,6,7" CONFIG_NO_HZ=y @@ -24,6 +25,7 @@ CONFIG_BLK_CGROUP=y CONFIG_CGROUP_SCHED=y # CONFIG_FAIR_GROUP_SCHED is not set CONFIG_UCLAMP_TASK_GROUP=y +CONFIG_UCLAMP_ASSIST=y CONFIG_CGROUP_FREEZER=y CONFIG_CPUSETS=y CONFIG_CGROUP_CPUACCT=y diff --git a/build.sh b/build.sh index 87f32665208a..1b112577e5bf 100755 --- a/build.sh +++ b/build.sh @@ -15,7 +15,7 @@ export THINLTO_CACHE=~/ltocache/ DEFCONFIG="raphael_defconfig" # Kernel Details -REV="R6.4" +REV="R6.5" EDITION="DSP" VER="$EDITION"-"$REV" diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 1992db8bda2c..47ac18c107bd 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -537,21 +537,11 @@ unsigned int cpufreq_policy_transition_delay_us(struct cpufreq_policy *policy) 
return policy->transition_delay_us; latency = policy->cpuinfo.transition_latency / NSEC_PER_USEC; - if (latency) { - /* - * For platforms that can change the frequency very fast (< 10 - * us), the above formula gives a decent transition delay. But - * for platforms where transition_latency is in milliseconds, it - * ends up giving unrealistic values. - * - * Cap the default transition delay to 10 ms, which seems to be - * a reasonable amount of time after which we should reevaluate - * the frequency. - */ - return min(latency * LATENCY_MULTIPLIER, (unsigned int)10000); - } + if (latency) + /* Give a 50% breathing room between updates */ + return latency + (latency >> 1); - return LATENCY_MULTIPLIER; + return USEC_PER_MSEC; } EXPORT_SYMBOL_GPL(cpufreq_policy_transition_delay_us); @@ -1880,7 +1870,7 @@ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, int ret; target_freq = clamp_val(target_freq, policy->min, policy->max); - ret = cpufreq_driver->fast_switch(policy, target_freq); + ret = cpufreq_driver->fast_switch(policy, target_freq); if (ret) { cpufreq_times_record_transition(policy, ret); cpufreq_stats_record_transition(policy, ret); diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 1284bc003031..bda35fe56b37 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -12,72 +12,124 @@ #include #include #include +#include #include struct cpufreq_stats { unsigned int total_trans; - atomic64_t last_time; + unsigned long long last_time; unsigned int max_state; unsigned int state_num; unsigned int last_index; - atomic64_t *time_in_state; + u64 *time_in_state; unsigned int *freq_table; unsigned int *trans_table; + + /* Deferred reset */ + unsigned int reset_pending; + unsigned long long reset_time; }; -static void cpufreq_stats_update(struct cpufreq_stats *stats) +static void cpufreq_stats_update(struct cpufreq_stats *stats, + unsigned long long time) { - unsigned long long cur_time = 
get_jiffies_64(); - unsigned long long time = cur_time; + unsigned long long cur_time = local_clock(); - time = atomic64_xchg(&stats->last_time, time); - atomic64_add(cur_time - time, &stats->time_in_state[stats->last_index]); + stats->time_in_state[stats->last_index] += cur_time - time; + stats->last_time = cur_time; } -static void cpufreq_stats_clear_table(struct cpufreq_stats *stats) +static void cpufreq_stats_reset_table(struct cpufreq_stats *stats) { unsigned int count = stats->max_state; - memset(stats->time_in_state, 0, count * sizeof(atomic64_t)); + memset(stats->time_in_state, 0, count * sizeof(u64)); memset(stats->trans_table, 0, count * count * sizeof(int)); - atomic64_set(&stats->last_time, get_jiffies_64()); + stats->last_time = local_clock(); stats->total_trans = 0; + + /* Adjust for the time elapsed since reset was requested */ + WRITE_ONCE(stats->reset_pending, 0); + /* + * Prevent the reset_time read from being reordered before the + * reset_pending accesses in cpufreq_stats_record_transition(). + */ + smp_rmb(); + cpufreq_stats_update(stats, READ_ONCE(stats->reset_time)); } static ssize_t show_total_trans(struct cpufreq_policy *policy, char *buf) { - return sprintf(buf, "%d\n", policy->stats->total_trans); + struct cpufreq_stats *stats = policy->stats; + + if (READ_ONCE(stats->reset_pending)) + return sprintf(buf, "%d\n", 0); + else + return sprintf(buf, "%u\n", stats->total_trans); } +cpufreq_freq_attr_ro(total_trans); static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) { struct cpufreq_stats *stats = policy->stats; + bool pending = READ_ONCE(stats->reset_pending); + unsigned long long time; ssize_t len = 0; int i; - cpufreq_stats_update(stats); for (i = 0; i < stats->state_num; i++) { + if (pending) { + if (i == stats->last_index) { + /* + * Prevent the reset_time read from occurring + * before the reset_pending read above. 
+ */ + smp_rmb(); + time = local_clock() - READ_ONCE(stats->reset_time); + } else { + time = 0; + } + } else { + time = stats->time_in_state[i]; + if (i == stats->last_index) + time += local_clock() - stats->last_time; + } + len += sprintf(buf + len, "%u %llu\n", stats->freq_table[i], - (unsigned long long) - jiffies_64_to_clock_t(atomic64_read( - &stats->time_in_state[i]))); + nsec_to_clock_t(time)); } return len; } +cpufreq_freq_attr_ro(time_in_state); +/* We don't care what is written to the attribute */ static ssize_t store_reset(struct cpufreq_policy *policy, const char *buf, size_t count) { - /* We don't care what is written to the attribute. */ - cpufreq_stats_clear_table(policy->stats); + struct cpufreq_stats *stats = policy->stats; + + /* + * Defer resetting of stats to cpufreq_stats_record_transition() to + * avoid races. + */ + WRITE_ONCE(stats->reset_time, local_clock()); + /* + * The memory barrier below is to prevent the readers of reset_time from + * seeing a stale or partially updated value. 
+ */ + smp_wmb(); + WRITE_ONCE(stats->reset_pending, 1); + return count; } +cpufreq_freq_attr_wo(reset); static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) { struct cpufreq_stats *stats = policy->stats; + bool pending = READ_ONCE(stats->reset_pending); ssize_t len = 0; - int i, j; + int i, j, count; len += scnprintf(buf + len, PAGE_SIZE - len, " From : To\n"); len += scnprintf(buf + len, PAGE_SIZE - len, " : "); @@ -102,8 +154,13 @@ static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) for (j = 0; j < stats->state_num; j++) { if (len >= PAGE_SIZE) break; - len += scnprintf(buf + len, PAGE_SIZE - len, "%9u ", - stats->trans_table[i*stats->max_state+j]); + + if (pending) + count = 0; + else + count = stats->trans_table[i * stats->max_state + j]; + + len += scnprintf(buf + len, PAGE_SIZE - len, "%9u ", count); } if (len >= PAGE_SIZE) break; @@ -118,10 +175,6 @@ static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) } cpufreq_freq_attr_ro(trans_table); -cpufreq_freq_attr_ro(total_trans); -cpufreq_freq_attr_ro(time_in_state); -cpufreq_freq_attr_wo(reset); - static struct attribute *default_attrs[] = { &total_trans.attr, &time_in_state.attr, @@ -161,7 +214,7 @@ void cpufreq_stats_free_table(struct cpufreq_policy *policy) void cpufreq_stats_create_table(struct cpufreq_policy *policy) { - unsigned int i = 0, count = 0, ret = -ENOMEM; + unsigned int i = 0, count; struct cpufreq_stats *stats; unsigned int alloc_size; struct cpufreq_frequency_table *pos; @@ -178,7 +231,7 @@ void cpufreq_stats_create_table(struct cpufreq_policy *policy) if (!stats) return; - alloc_size = count * sizeof(int) + count * sizeof(atomic64_t); + alloc_size = count * sizeof(int) + count * sizeof(u64); alloc_size += count * count * sizeof(int); @@ -199,12 +252,11 @@ void cpufreq_stats_create_table(struct cpufreq_policy *policy) stats->freq_table[i++] = pos->frequency; stats->state_num = i; - atomic64_set(&stats->last_time, get_jiffies_64()); 
+ stats->last_time = local_clock(); stats->last_index = freq_table_get_index(stats, policy->cur); policy->stats = stats; - ret = sysfs_create_group(&policy->kobj, &stats_attr_group); - if (!ret) + if (!sysfs_create_group(&policy->kobj, &stats_attr_group)) return; /* We failed, release resources */ @@ -220,10 +272,11 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy, struct cpufreq_stats *stats = policy->stats; int old_index, new_index; - if (unlikely(!stats)) { - pr_debug("%s: No stats found\n", __func__); + if (unlikely(!stats)) return; - } + + if (unlikely(READ_ONCE(stats->reset_pending))) + cpufreq_stats_reset_table(stats); old_index = stats->last_index; new_index = freq_table_get_index(stats, new_freq); @@ -232,7 +285,7 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy, if (unlikely(old_index == -1 || new_index == -1 || old_index == new_index)) return; - cpufreq_stats_update(stats); + cpufreq_stats_update(stats, stats->last_time); stats->last_index = new_index; stats->trans_table[old_index * stats->max_state + new_index]++; diff --git a/drivers/platform/msm/gsi/gsi.c b/drivers/platform/msm/gsi/gsi.c index 81b54bd38a2a..7b339e3643a6 100644 --- a/drivers/platform/msm/gsi/gsi.c +++ b/drivers/platform/msm/gsi/gsi.c @@ -581,7 +581,6 @@ static void gsi_process_chan(struct gsi_xfer_compl_evt *evt, if (callback) { if (unlikely(atomic_read(&ch_ctx->poll_mode))) { GSIERR("Calling client callback in polling mode\n"); - WARN_ON(1); } ch_ctx->props.xfer_cb(notify); } diff --git a/drivers/staging/qca-wifi-host-cmn/umac/scan/dispatcher/src/wlan_scan_utils_api.c b/drivers/staging/qca-wifi-host-cmn/umac/scan/dispatcher/src/wlan_scan_utils_api.c index 86c1c1b81a02..02ab28c5bc97 100644 --- a/drivers/staging/qca-wifi-host-cmn/umac/scan/dispatcher/src/wlan_scan_utils_api.c +++ b/drivers/staging/qca-wifi-host-cmn/umac/scan/dispatcher/src/wlan_scan_utils_api.c @@ -709,7 +709,8 @@ util_scan_parse_rnr_ie(struct scan_cache_entry *scan_entry, 
rnr_ie_len = ie->ie_len; data = (uint8_t *)ie + sizeof(struct ie_header); - while (data < ((uint8_t *)ie + rnr_ie_len + 2)) { + while ((data + sizeof(struct neighbor_ap_info_field)) <= + ((uint8_t *)ie + rnr_ie_len + 2)) { neighbor_ap_info = (struct neighbor_ap_info_field *)data; tbtt_count = neighbor_ap_info->tbtt_header.tbtt_info_count; tbtt_length = neighbor_ap_info->tbtt_header.tbtt_info_length; @@ -725,7 +726,8 @@ util_scan_parse_rnr_ie(struct scan_cache_entry *scan_entry, break; for (i = 0; i < (tbtt_count + 1) && - data < ((uint8_t *)ie + rnr_ie_len + 2); i++) { + (data + tbtt_length) <= + ((uint8_t *)ie + rnr_ie_len + 2); i++) { if (i < MAX_RNR_BSS) util_scan_update_rnr( &scan_entry->rnr.bss_info[i], diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 84188e180a2d..40e1fdee07f4 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -493,14 +493,6 @@ static inline unsigned long cpufreq_scale(unsigned long old, u_int div, #define CPUFREQ_POLICY_POWERSAVE (1) #define CPUFREQ_POLICY_PERFORMANCE (2) -/* - * The polling frequency depends on the capability of the processor. Default - * polling frequency is 1000 times the transition latency of the processor. The - * ondemand governor will work on any processor with transition latency <= 10ms, - * using appropriate sampling rate. 
- */ -#define LATENCY_MULTIPLIER (1000) - struct cpufreq_governor { char name[CPUFREQ_NAME_LEN]; int (*init)(struct cpufreq_policy *policy); diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h index 215e65da1be5..7a4050b79ec4 100644 --- a/include/linux/sched/cpufreq.h +++ b/include/linux/sched/cpufreq.h @@ -12,14 +12,12 @@ #define SCHED_CPUFREQ_DL (1U << 1) #define SCHED_CPUFREQ_IOWAIT (1U << 2) #define SCHED_CPUFREQ_INTERCLUSTER_MIG (1U << 3) -#define SCHED_CPUFREQ_WALT (1U << 4) +#define SCHED_CPUFREQ_RESERVED (1U << 4) #define SCHED_CPUFREQ_PL (1U << 5) #define SCHED_CPUFREQ_EARLY_DET (1U << 6) #define SCHED_CPUFREQ_FORCE_UPDATE (1U << 7) #define SCHED_CPUFREQ_CONTINUE (1U << 8) -#define SCHED_CPUFREQ_RT_DL (SCHED_CPUFREQ_RT | SCHED_CPUFREQ_DL) - #ifdef CONFIG_CPU_FREQ struct update_util_data { void (*func)(struct update_util_data *data, u64 time, unsigned int flags); diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 2ee605775225..4091e5547a69 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -116,16 +116,16 @@ extern int sched_rt_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +extern int sched_updown_migrate_handler(struct ctl_table *table, + int write, void __user *buffer, + size_t *lenp, loff_t *ppos); + #ifdef CONFIG_UCLAMP_TASK extern int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif -extern int sched_updown_migrate_handler(struct ctl_table *table, - int write, void __user *buffer, - size_t *lenp, loff_t *ppos); - extern int sysctl_numa_balancing(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e244363980f6..c68613e8115f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -46,6 +46,7 @@ #include "sched.h" #include "walt.h" +#include "tune.h" #include 
"../workqueue_internal.h" #include "../smpboot.h" @@ -801,7 +802,7 @@ unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; * This knob will not override the system default sched_util_clamp_min defined * above. */ -unsigned int sysctl_sched_uclamp_util_min_rt_default = 0; +unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE; /* All clamps are required to be less or equal than these values */ static struct uclamp_se uclamp_default[UCLAMP_CNT]; @@ -837,12 +838,7 @@ static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1); } -static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value) -{ - return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value); -} - -static inline enum uclamp_id uclamp_none(enum uclamp_id clamp_id) +static inline unsigned int uclamp_none(enum uclamp_id clamp_id) { if (clamp_id == UCLAMP_MIN) return 0; @@ -885,7 +881,7 @@ static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id, } static inline -enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id, +unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id, unsigned int clamp_value) { struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket; @@ -1451,6 +1447,40 @@ static void uclamp_post_fork(struct task_struct *p) uclamp_update_util_min_rt_default(p); } +#ifdef CONFIG_SMP +unsigned int uclamp_task(struct task_struct *p) +{ + unsigned long util; + + util = task_util_est(p); + util = max(util, uclamp_eff_value(p, UCLAMP_MIN)); + util = min(util, uclamp_eff_value(p, UCLAMP_MAX)); + + return util; +} + +bool uclamp_boosted(struct task_struct *p) +{ + return uclamp_eff_value(p, UCLAMP_MIN) > 0; +} + +bool uclamp_latency_sensitive(struct task_struct *p) +{ +#ifdef CONFIG_UCLAMP_TASK_GROUP + struct cgroup_subsys_state *css = task_css(p, cpu_cgrp_id); + struct task_group *tg; + + if (!css) + return 
false; + tg = container_of(css, struct task_group, css); + + return tg->latency_sensitive; +#else + return false; +#endif +} +#endif /* CONFIG_SMP */ + static void __init init_uclamp_rq(struct rq *rq) { enum uclamp_id clamp_id; @@ -1502,6 +1532,41 @@ static void __setscheduler_uclamp(struct task_struct *p, const struct sched_attr *attr) { } static inline void uclamp_fork(struct task_struct *p) { } static inline void uclamp_post_fork(struct task_struct *p) { } + +long schedtune_task_margin(struct task_struct *task); + +#ifdef CONFIG_SMP +unsigned int uclamp_task(struct task_struct *p) +{ + unsigned long util = task_util_est(p); +#ifdef CONFIG_SCHED_TUNE + long margin = schedtune_task_margin(p); + + trace_sched_boost_task(p, util, margin); + + util += margin; +#endif + + return util; +} + +bool uclamp_boosted(struct task_struct *p) +{ +#ifdef CONFIG_SCHED_TUNE + return schedtune_task_boost(p) > 0; +#endif + return false; +} + +bool uclamp_latency_sensitive(struct task_struct *p) +{ +#ifdef CONFIG_SCHED_TUNE + return schedtune_prefer_idle(p) != 0; +#endif + return false; +} +#endif /* CONFIG_SMP */ + static inline void init_uclamp(void) { } #endif /* CONFIG_UCLAMP_TASK */ @@ -2752,15 +2817,11 @@ void wake_up_if_idle(int cpu) if (!is_idle_task(rcu_dereference(rq->curr))) goto out; - if (set_nr_if_polling(rq->idle)) { - trace_sched_wake_idle_without_ipi(cpu); - } else { - rq_lock_irqsave(rq, &rf); - if (is_idle_task(rq->curr)) - arch_send_wakeup_ipi_mask(cpumask_of(cpu)); - /* Else CPU is not idle, do nothing here: */ - rq_unlock_irqrestore(rq, &rf); - } + rq_lock_irqsave(rq, &rf); + if (is_idle_task(rq->curr)) + resched_curr(rq); + /* Else CPU is not idle, do nothing here: */ + rq_unlock_irqrestore(rq, &rf); out: rcu_read_unlock(); @@ -3139,9 +3200,7 @@ out: if (success && sched_predl) { raw_spin_lock_irqsave(&cpu_rq(cpu)->lock, flags); if (do_pl_notif(cpu_rq(cpu))) - cpufreq_update_util(cpu_rq(cpu), - SCHED_CPUFREQ_WALT | - SCHED_CPUFREQ_PL); + 
cpufreq_update_util(cpu_rq(cpu), SCHED_CPUFREQ_PL); raw_spin_unlock_irqrestore(&cpu_rq(cpu)->lock, flags); } #endif @@ -4124,7 +4183,6 @@ void scheduler_tick(void) bool early_notif; u32 old_load; struct related_thread_group *grp; - unsigned int flag = 0; unsigned long thermal_pressure; sched_clock_tick(); @@ -4143,9 +4201,8 @@ void scheduler_tick(void) early_notif = early_detection_notify(rq, wallclock); if (early_notif) - flag = SCHED_CPUFREQ_WALT | SCHED_CPUFREQ_EARLY_DET; + cpufreq_update_util(rq, SCHED_CPUFREQ_EARLY_DET); - cpufreq_update_util(rq, flag); rq_unlock(rq, &rf); perf_event_task_tick(); @@ -8284,6 +8341,9 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css) enum uclamp_id clamp_id; unsigned int clamps; + lockdep_assert_held(&uclamp_mutex); + SCHED_WARN_ON(!rcu_read_lock_held()); + css_for_each_descendant_pre(css, top_css) { uc_parent = css_tg(css)->parent ? css_tg(css)->parent->uclamp : NULL; diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 064e09359a6c..eb1df0c899a6 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -67,15 +67,9 @@ struct sugov_cpu { unsigned long util; unsigned long bw_min; - - /* The field below is for single-CPU policies only: */ -#ifdef CONFIG_NO_HZ_COMMON - unsigned long saved_idle_calls; -#endif }; static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); -static unsigned int stale_ns; static DEFINE_PER_CPU(struct sugov_tunables *, cached_tunables); /************************ Governor internals ***********************/ @@ -108,6 +102,10 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) return true; } + /* If the last frequency wasn't set yet then we can still amend it */ + if (sg_policy->work_in_progress) + return true; + /* No need to recalculate next freq for min_rate_limit_us * at least. 
However we might still decide to further rate * limit once frequency change direction is decided, according @@ -120,7 +118,11 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) static inline bool use_pelt(void) { +#ifdef CONFIG_SCHED_WALT + return false; +#else return true; +#endif } static bool sugov_up_down_rate_limit(struct sugov_policy *sg_policy, u64 time, @@ -248,9 +250,6 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, return l_freq; } -extern long -schedtune_cpu_margin_with(unsigned long util, int cpu, struct task_struct *p); - /* * This function computes an effective utilization for the given CPU, to be * used for frequency selection given the linear relation: f = u * f_max. @@ -357,11 +356,10 @@ unsigned long apply_dvfs_headroom(int cpu, unsigned long util, unsigned long max if (!util || util >= max_cap) return util; - if (cpumask_test_cpu(cpu, cpu_lp_mask)) { + if (cpumask_test_cpu(cpu, cpu_lp_mask)) headroom = util + (util >> 1); - } else { + else headroom = util + (util >> 2); - } return headroom; } @@ -513,19 +511,6 @@ static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, return (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT; } -#ifdef CONFIG_NO_HZ_COMMON -static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) -{ - unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu); - bool ret = idle_calls == sg_cpu->saved_idle_calls; - - sg_cpu->saved_idle_calls = idle_calls; - return ret; -} -#else -static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } -#endif /* CONFIG_NO_HZ_COMMON */ - /* * Make sugov_should_update_freq() ignore the rate limit when DL * has increased the utilization. 
@@ -543,8 +528,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, struct sugov_policy *sg_policy = sg_cpu->sg_policy; unsigned long max_cap; unsigned int next_f; - bool busy; - unsigned long boost; + unsigned long boost; max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); @@ -556,25 +540,10 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, if (!sugov_should_update_freq(sg_policy, time)) return; - /* Limits may have changed, don't skip frequency update */ - busy = use_pelt() && !sg_policy->need_freq_update && - sugov_cpu_is_busy(sg_cpu); - boost = sugov_iowait_apply(sg_cpu, time, max_cap); sugov_get_util(sg_cpu, boost); next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap); - /* - * Do not reduce the frequency if the CPU has not been idle - * recently, as the reduction is likely to be premature then. - */ - if (busy && next_f < sg_policy->next_freq && - !sg_policy->need_freq_update) { - next_f = sg_policy->next_freq; - - /* Restore cached freq as next_freq has changed */ - sg_policy->cached_raw_freq = sg_policy->prev_cached_raw_freq; - } /* * This code runs under rq->lock for the target CPU, so it won't run @@ -601,22 +570,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) for_each_cpu(j, policy->cpus) { struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); - unsigned long boost; - s64 delta_ns; - - /* - * If the CPU utilization was last updated before the previous - * frequency update and the time elapsed between the last update - * of the CPU utilization and the last frequency update is long - * enough, don't take the CPU into account as it probably is - * idle now (and clear iowait_boost for it). 
- */ - delta_ns = time - j_sg_cpu->last_update; - if (delta_ns > stale_ns) { - sugov_iowait_reset(j_sg_cpu, time, false); - continue; - } - + unsigned long boost; boost = sugov_iowait_apply(j_sg_cpu, time, max_cap); sugov_get_util(j_sg_cpu, boost); @@ -761,6 +715,28 @@ static struct attribute *sugov_attributes[] = { NULL }; +static void sugov_tunables_save(struct cpufreq_policy *policy, + struct sugov_tunables *tunables) +{ + int cpu; + struct sugov_tunables *cached = per_cpu(cached_tunables, policy->cpu); + + if (!have_governor_per_policy()) + return; + + if (!cached) { + cached = kzalloc(sizeof(*tunables), GFP_KERNEL); + if (!cached) + return; + + for_each_cpu(cpu, policy->related_cpus) + per_cpu(cached_tunables, cpu) = cached; + } + + cached->up_rate_limit_us = tunables->up_rate_limit_us; + cached->down_rate_limit_us = tunables->down_rate_limit_us; +} + static void sugov_tunables_free(struct kobject *kobj) { struct gov_attr_set *attr_set = container_of(kobj, struct gov_attr_set, kobj); @@ -768,6 +744,19 @@ static void sugov_tunables_free(struct kobject *kobj) kfree(to_sugov_tunables(attr_set)); } +static void sugov_tunables_restore(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy = policy->governor_data; + struct sugov_tunables *tunables = sg_policy->tunables; + struct sugov_tunables *cached = per_cpu(cached_tunables, policy->cpu); + + if (!cached) + return; + + tunables->up_rate_limit_us = cached->up_rate_limit_us; + tunables->down_rate_limit_us = cached->down_rate_limit_us; +} + static struct kobj_type sugov_tunables_ktype = { .default_attrs = sugov_attributes, .sysfs_ops = &governor_sysfs_ops, @@ -825,7 +814,8 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) } sg_policy->thread = thread; - kthread_bind_mask(thread, policy->related_cpus); + if (!policy->dvfs_possible_from_any_cpu) + kthread_bind_mask(thread, policy->related_cpus); init_irq_work(&sg_policy->irq_work, sugov_irq_work); mutex_init(&sg_policy->work_lock); @@ 
-858,48 +848,12 @@ static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_polic return tunables; } -static void sugov_tunables_save(struct cpufreq_policy *policy, - struct sugov_tunables *tunables) -{ - int cpu; - struct sugov_tunables *cached = per_cpu(cached_tunables, policy->cpu); - - if (!have_governor_per_policy()) - return; - - if (!cached) { - cached = kzalloc(sizeof(*tunables), GFP_KERNEL); - if (!cached) - return; - - for_each_cpu(cpu, policy->related_cpus) - per_cpu(cached_tunables, cpu) = cached; - } - - cached->up_rate_limit_us = tunables->up_rate_limit_us; - cached->down_rate_limit_us = tunables->down_rate_limit_us; -} - - static void sugov_clear_global_tunables(void) { if (!have_governor_per_policy()) global_tunables = NULL; } -static void sugov_tunables_restore(struct cpufreq_policy *policy) -{ - struct sugov_policy *sg_policy = policy->governor_data; - struct sugov_tunables *tunables = sg_policy->tunables; - struct sugov_tunables *cached = per_cpu(cached_tunables, policy->cpu); - - if (!cached) - return; - - tunables->up_rate_limit_us = cached->up_rate_limit_us; - tunables->down_rate_limit_us = cached->down_rate_limit_us; -} - static int sugov_init(struct cpufreq_policy *policy) { struct sugov_policy *sg_policy; @@ -948,8 +902,6 @@ static int sugov_init(struct cpufreq_policy *policy) policy->governor_data = sg_policy; sg_policy->tunables = tunables; - stale_ns = sched_ravg_window + (sched_ravg_window >> 3); - sugov_tunables_restore(policy); ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype, @@ -989,13 +941,15 @@ static void sugov_exit(struct cpufreq_policy *policy) mutex_lock(&global_tunables_lock); + /* Save tunables before last owner release it in gov_attr_set_put() */ + if (tunables->attr_set.usage_count == 1) + sugov_tunables_save(policy, tunables); + count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook); policy->governor_data = NULL; - - if (!count) { - sugov_tunables_save(policy, 
tunables); + if (!count) sugov_clear_global_tunables(); - } + mutex_unlock(&global_tunables_lock); sugov_kthread_stop(sg_policy); diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index f37daebed44e..487e4fdf5055 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -55,6 +55,8 @@ static int convert_prio(int prio) * @cp: The cpupri context * @p: The task * @lowest_mask: A mask to fill in with selected CPUs (or NULL) + * @fitness_fn: A pointer to a function to do custom checks whether the CPU + * fits a specific criteria so that we only return those CPUs. * * Note: This function returns the recommended CPUs as calculated during the * current invocation. By the time the call returns, the CPUs may have in @@ -66,7 +68,8 @@ static int convert_prio(int prio) * Return: (int)bool - CPUs were found */ int cpupri_find(struct cpupri *cp, struct task_struct *p, - struct cpumask *lowest_mask) + struct cpumask *lowest_mask, + bool (*fitness_fn)(struct task_struct *p, int cpu)) { int idx = 0; int task_pri = convert_prio(p->prio); @@ -107,6 +110,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, continue; if (lowest_mask) { + int cpu; + cpumask_and(lowest_mask, p->cpus_ptr, vec->mask); cpumask_andnot(lowest_mask, lowest_mask, cpu_isolated_mask); @@ -119,7 +124,23 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, * condition, simply act as though we never hit this * priority level and continue on. 
*/ - if (cpumask_any(lowest_mask) >= nr_cpu_ids) + if (cpumask_empty(lowest_mask)) + continue; + + if (!fitness_fn) + return 1; + + /* Ensure the capacity of the CPUs fit the task */ + for_each_cpu(cpu, lowest_mask) { + if (!fitness_fn(p, cpu)) + cpumask_clear_cpu(cpu, lowest_mask); + } + + /* + * If no CPU at the current priority can fit the task + * continue looking + */ + if (cpumask_empty(lowest_mask)) continue; } diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index bab050019071..c08add835730 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -22,8 +22,9 @@ struct cpupri { }; #ifdef CONFIG_SMP -int cpupri_find(struct cpupri *cp, - struct task_struct *p, struct cpumask *lowest_mask); +int cpupri_find(struct cpupri *cp, struct task_struct *p, + struct cpumask *lowest_mask, + bool (*fitness_fn)(struct task_struct *p, int cpu)); void cpupri_set(struct cpupri *cp, int cpu, int pri); int cpupri_init(struct cpupri *cp); void cpupri_cleanup(struct cpupri *cp); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 797fd81f470c..339ae3761ba6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4113,7 +4113,7 @@ static inline unsigned long _task_util_est(struct task_struct *p) return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED); } -static inline unsigned long task_util_est(struct task_struct *p) +unsigned long task_util_est(struct task_struct *p) { #ifdef CONFIG_SCHED_WALT if (likely(!walt_disabled && sysctl_sched_use_walt_task_util)) @@ -4132,7 +4132,7 @@ static inline unsigned long uclamp_task_util(struct task_struct *p) #else static inline unsigned long uclamp_task_util(struct task_struct *p) { - return boosted_task_util(p); + return task_util_est(p); } #endif @@ -6875,19 +6875,23 @@ schedtune_margin(unsigned long signal, long boost, long capacity) return margin; } -static inline int -schedtune_cpu_margin(unsigned long util, int cpu) +inline long +schedtune_cpu_margin_with(unsigned long util, int cpu, struct task_struct 
*p) { - int boost = schedtune_cpu_boost(cpu); + int boost = schedtune_cpu_boost_with(cpu, p); + long margin; if (boost == 0) - return 0; + margin = 0; + else + margin = schedtune_margin(util, boost); - return schedtune_margin(util, boost, capacity_orig_of(cpu)); + trace_sched_boost_cpu(cpu, util, margin); + + return margin; } -static inline long -schedtune_task_margin(struct task_struct *task) +long schedtune_task_margin(struct task_struct *task) { int boost = schedtune_task_boost(task); unsigned long util; @@ -6904,50 +6908,14 @@ schedtune_task_margin(struct task_struct *task) #else /* CONFIG_SCHED_TUNE */ -static inline int -schedtune_cpu_margin(unsigned long util, int cpu) -{ - return 0; -} - -static inline int -schedtune_task_margin(struct task_struct *task) +inline long +schedtune_cpu_margin_with(unsigned long util, int cpu, struct task_struct *p) { return 0; } #endif /* CONFIG_SCHED_TUNE */ -unsigned long -boosted_cpu_util(int cpu, struct sched_walt_cpu_load *walt_load) -{ - unsigned long util = cpu_util_freq(cpu, walt_load); - long margin = schedtune_cpu_margin(util, cpu); - - trace_sched_boost_cpu(cpu, util, margin); - - return util + margin; -} - -static inline unsigned long -boosted_task_util(struct task_struct *task) -{ -#ifdef CONFIG_UCLAMP_TASK_GROUP - unsigned long util = task_util_est(task); - unsigned long util_min = uclamp_eff_value(task, UCLAMP_MIN); - unsigned long util_max = uclamp_eff_value(task, UCLAMP_MAX); - - return clamp(util, util_min, util_max); -#else - unsigned long util = task_util_est(task); - long margin = schedtune_task_margin(task); - - trace_sched_boost_task(task, util, margin); - - return util + margin; -#endif -} - static unsigned long cpu_util_without(int cpu, struct task_struct *p); static unsigned long capacity_spare_without(int cpu, struct task_struct *p) @@ -7465,7 +7433,7 @@ static inline int select_idle_sibling_cstate_aware(struct task_struct *p, int pr continue; /* figure out if the task can fit here at all */ - 
new_usage = boosted_task_util(p); + new_usage = uclamp_task(p); capacity_orig = capacity_orig_of(i); if (new_usage > capacity_orig) @@ -7627,7 +7595,7 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, bool prefer_idle, struct find_best_target_env *fbt_env) { - unsigned long min_util = boosted_task_util(p); + unsigned long min_util = uclamp_task(p); unsigned long target_capacity = ULONG_MAX; unsigned long min_wake_util = ULONG_MAX; unsigned long target_max_spare_cap = 0; @@ -7724,10 +7692,6 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, if (sched_cpu_high_irqload(i)) continue; - /* Skip CPUs which do not fit task requirements */ - if (capacity_of(i) < boosted_task_util(p)) - continue; - /* * p's blocked utilization is still accounted for on prev_cpu * so prev_cpu will receive a negative bias due to the double @@ -8280,7 +8244,7 @@ static inline struct energy_env *get_eenv(struct task_struct *p, int prev_cpu) * util for group utilization calculations */ eenv->util_delta = task_util_est(p); - eenv->util_delta_boosted = boosted_task_util(p); + eenv->util_delta_boosted = uclamp_task(p); cpumask_and(&cpumask_possible_cpus, p->cpus_ptr, cpu_online_mask); eenv->max_cpu_count = cpumask_weight(&cpumask_possible_cpus); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index d6ece8c45020..8cd41e4b39e9 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -458,6 +458,45 @@ static inline int on_rt_rq(struct sched_rt_entity *rt_se) return rt_se->on_rq; } +#ifdef CONFIG_UCLAMP_TASK +/* + * Verify the fitness of task @p to run on @cpu taking into account the uclamp + * settings. + * + * This check is only important for heterogeneous systems where uclamp_min value + * is higher than the capacity of a @cpu. For non-heterogeneous system this + * function will always return true. + * + * The function will return true if the capacity of the @cpu is >= the + * uclamp_min and false otherwise. 
+ * + * Note that uclamp_min will be clamped to uclamp_max if uclamp_min + * > uclamp_max. + */ +static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) +{ + unsigned int min_cap; + unsigned int max_cap; + unsigned int cpu_cap; + + /* Only heterogeneous systems can benefit from this check */ + if (!static_branch_unlikely(&sched_asym_cpucapacity)) + return true; + + min_cap = uclamp_eff_value(p, UCLAMP_MIN); + max_cap = uclamp_eff_value(p, UCLAMP_MAX); + + cpu_cap = capacity_orig_of(cpu); + + return cpu_cap >= min(min_cap, max_cap); +} +#else +static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) +{ + return true; +} +#endif + #ifdef CONFIG_RT_GROUP_SCHED static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) @@ -1481,6 +1520,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) { struct task_struct *curr; struct rq *rq; + bool test; /* For anything but wake ups, just return the task_cpu */ if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) @@ -1512,11 +1552,17 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) * * This test is optimistic, if we get it wrong the load-balancer * will have to sort it out. + * + * We take into account the capacity of the CPU to ensure it fits the + * requirement of the task - which is only important on heterogeneous + * systems like big.LITTLE. */ - if (energy_aware() || + test = energy_aware() || (curr && unlikely(rt_task(curr)) && (curr->nr_cpus_allowed < 2 || - curr->prio <= p->prio))) { + curr->prio <= p->prio)); + + if (test || !rt_task_fits_capacity(p, cpu)) { int target = find_lowest_rq(p); /* @@ -1540,15 +1586,15 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) * let's hope p can move out. 
*/ if (rq->curr->nr_cpus_allowed == 1 || - !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) + !cpupri_find(&rq->rd->cpupri, rq->curr, NULL, NULL)) return; /* * p is migratable, so let's not schedule it and * see if it is pushed or pulled somewhere else. */ - if (p->nr_cpus_allowed != 1 - && cpupri_find(&rq->rd->cpupri, p, NULL)) + if (p->nr_cpus_allowed != 1 && + cpupri_find(&rq->rd->cpupri, p, NULL, NULL)) return; /* @@ -1706,7 +1752,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - cpumask_test_cpu(cpu, p->cpus_ptr)) + cpumask_test_cpu(cpu, p->cpus_ptr) && + rt_task_fits_capacity(p, cpu)) return 1; return 0; } @@ -1850,7 +1897,8 @@ static int find_lowest_rq(struct task_struct *task) if (task->nr_cpus_allowed == 1) return -1; /* No other targets possible */ - if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) + if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask, + rt_task_fits_capacity)) return -1; /* No targets found */ if (energy_aware()) @@ -2368,12 +2416,14 @@ skip: */ static void task_woken_rt(struct rq *rq, struct task_struct *p) { - if (!task_running(rq, p) && - !test_tsk_need_resched(rq->curr) && - p->nr_cpus_allowed > 1 && - (dl_task(rq->curr) || rt_task(rq->curr)) && - (rq->curr->nr_cpus_allowed < 2 || - rq->curr->prio <= p->prio)) + bool need_to_push = !task_running(rq, p) && + !test_tsk_need_resched(rq->curr) && + p->nr_cpus_allowed > 1 && + (dl_task(rq->curr) || rt_task(rq->curr)) && + (rq->curr->nr_cpus_allowed < 2 || + rq->curr->prio <= p->prio); + + if (need_to_push || !rt_task_fits_capacity(p, cpu_of(rq))) push_rt_tasks(rq); } @@ -2446,7 +2496,10 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) */ if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP - if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) + bool need_to_push = rq->rt.overloaded || + !rt_task_fits_capacity(p, 
cpu_of(rq)); + + if (p->nr_cpus_allowed > 1 && need_to_push) queue_push_tasks(rq); #endif /* CONFIG_SMP */ if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 55f476b2e201..2ef572842824 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -427,8 +427,6 @@ struct task_group { struct uclamp_se uclamp[UCLAMP_CNT]; /* Latency-sensitive flag used for a task group */ unsigned int latency_sensitive; - /* Boosted flag for a task group */ - unsigned int boosted; #endif }; @@ -953,15 +951,16 @@ struct rq { unsigned long nr_load_updates; u64 nr_switches; + struct cfs_rq cfs; + struct rt_rq rt; + struct dl_rq dl; + #ifdef CONFIG_UCLAMP_TASK /* Utilization clamp values based on CPU's RUNNABLE tasks */ struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned; unsigned int uclamp_flags; #define UCLAMP_FLAG_IDLE 0x01 #endif - struct cfs_rq cfs; - struct rt_rq rt; - struct dl_rq dl; #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ @@ -2323,7 +2322,7 @@ cpu_util_freq_walt(int cpu, struct sched_walt_cpu_load *walt_load) static inline unsigned long cpu_util_freq(int cpu, struct sched_walt_cpu_load *walt_load) { - return cpu_util_freq_walt(cpu, walt_load); + return min(cpu_util(cpu), capacity_orig_of(cpu)); } #else @@ -2333,91 +2332,6 @@ cpu_util_freq(int cpu, struct sched_walt_cpu_load *walt_load) #endif /* CONFIG_SCHED_WALT */ -#ifdef CONFIG_SMP -static inline unsigned long cpu_util_cfs(struct rq *rq) -{ - unsigned long util = READ_ONCE(rq->cfs.avg.util_avg); - - if (sched_feat(UTIL_EST)) { - util = max_t(unsigned long, util, - READ_ONCE(rq->cfs.avg.util_est.enqueued)); - } - - return util; -} -#endif - -#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL -unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, - unsigned long *min, - unsigned long *max); - -unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual, - unsigned long min, - unsigned long max); - -static inline 
unsigned long cpu_bw_dl(struct rq *rq) -{ - return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; -} - -static inline unsigned long cpu_util_dl(struct rq *rq) -{ - return READ_ONCE(rq->avg_dl.util_avg); -} - -static inline unsigned long cpu_util_rt(struct rq *rq) -{ - return READ_ONCE(rq->avg_rt.util_avg); -} -#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ -static inline unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, - unsigned long max, enum schedutil_type type, - struct task_struct *p) -{ - return 0; -} -#endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ - -#ifdef CONFIG_HAVE_SCHED_AVG_IRQ -static inline unsigned long cpu_util_irq(struct rq *rq) -{ - return rq->avg_irq.util_avg; -} - -static inline -unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) -{ - util *= (max - irq); - util /= max; - - return util; - -} -#else -static inline unsigned long cpu_util_irq(struct rq *rq) -{ - return 0; -} - -static inline -unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) -{ - return util; -} -#endif - -static inline unsigned long cpu_util(int cpu) -{ - return min(__cpu_util(cpu) + cpu_util_rt(cpu_rq(cpu)), capacity_orig_of(cpu)); -} - -static inline unsigned long -cpu_util_freq(int cpu, struct sched_walt_cpu_load *walt_load) -{ - return min(cpu_util(cpu), capacity_orig_of(cpu)); -} - extern unsigned int capacity_margin_freq; static inline unsigned long @@ -2760,8 +2674,6 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) u64 clock; #ifdef CONFIG_SCHED_WALT - if (!(flags & SCHED_CPUFREQ_WALT)) - return; clock = sched_ktime_clock(); #else clock = rq_clock(rq); @@ -2776,6 +2688,17 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif /* CONFIG_CPU_FREQ */ +#ifdef CONFIG_SCHED_WALT + +static inline bool +walt_task_in_cum_window_demand(struct rq *rq, 
struct task_struct *p) +{ + return cpu_of(rq) == task_cpu(p) && + (p->on_rq || p->last_sleep_ts >= rq->window_start); +} + +#endif /* CONFIG_SCHED_WALT */ + #ifdef CONFIG_UCLAMP_TASK unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); @@ -2894,60 +2817,10 @@ static inline bool uclamp_rq_is_idle(struct rq *rq) } #endif /* CONFIG_UCLAMP_TASK */ -#ifdef CONFIG_UCLAMP_TASK_GROUP -static inline bool uclamp_latency_sensitive(struct task_struct *p) -{ - struct cgroup_subsys_state *css = task_css(p, cpuset_cgrp_id); - struct task_group *tg; - - if (!css) - return false; - - if (!strlen(css->cgroup->kn->name)) - return 0; - - tg = container_of(css, struct task_group, css); - - return tg->latency_sensitive; -} - -static inline bool uclamp_boosted(struct task_struct *p) -{ - struct cgroup_subsys_state *css = task_css(p, cpuset_cgrp_id); - struct task_group *tg; - - if (!css) - return false; - - if (!strlen(css->cgroup->kn->name)) - return 0; - - tg = container_of(css, struct task_group, css); - - return tg->boosted; -} -#else -static inline bool uclamp_latency_sensitive(struct task_struct *p) -{ - return false; -} - -static inline bool uclamp_boosted(struct task_struct *p) -{ - return false; -} -#endif /* CONFIG_UCLAMP_TASK_GROUP */ - -#ifdef CONFIG_SCHED_WALT - -static inline bool -walt_task_in_cum_window_demand(struct rq *rq, struct task_struct *p) -{ - return cpu_of(rq) == task_cpu(p) && - (p->on_rq || p->last_sleep_ts >= rq->window_start); -} - -#endif /* CONFIG_SCHED_WALT */ +unsigned long task_util_est(struct task_struct *p); +unsigned int uclamp_task(struct task_struct *p); +bool uclamp_latency_sensitive(struct task_struct *p); +bool uclamp_boosted(struct task_struct *p); #ifdef arch_scale_freq_capacity #ifndef arch_scale_freq_invariant @@ -2957,6 +2830,89 @@ walt_task_in_cum_window_demand(struct rq *rq, struct task_struct *p) #define arch_scale_freq_invariant() (false) #endif +#ifdef CONFIG_SMP +static inline unsigned long 
cpu_util_cfs(struct rq *rq) +{ + unsigned long util = READ_ONCE(rq->cfs.avg.util_avg); + + if (sched_feat(UTIL_EST)) { + util = max_t(unsigned long, util, + READ_ONCE(rq->cfs.avg.util_est.enqueued)); + } + + return util; +} +#endif + +#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL +unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, + unsigned long *min, + unsigned long *max); + +unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual, + unsigned long min, + unsigned long max); + +static inline unsigned long cpu_bw_dl(struct rq *rq) +{ + return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; +} + +static inline unsigned long cpu_util_dl(struct rq *rq) +{ + return READ_ONCE(rq->avg_dl.util_avg); +} + +static inline unsigned long cpu_util_rt(struct rq *rq) +{ + return READ_ONCE(rq->avg_rt.util_avg); +} +#endif + +#ifdef CONFIG_SMP +#ifndef CONFIG_SCHED_WALT +static inline unsigned long cpu_util(int cpu) +{ + return min(__cpu_util(cpu) + cpu_util_rt(cpu_rq(cpu)), + capacity_orig_of(cpu)); +} +#endif +#endif + +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ +static inline unsigned long cpu_util_irq(struct rq *rq) +{ + return rq->avg_irq.util_avg; +} + +static inline +unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) +{ + util *= (max - irq); + util /= max; + + return util; + +} +#else +static inline unsigned long cpu_util_irq(struct rq *rq) +{ + return 0; +} + +static inline +unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) +{ + return util; +} +#endif + +static inline unsigned long +cpu_util_freq(int cpu, struct sched_walt_cpu_load *walt_load) +{ + return min(cpu_util(cpu), capacity_orig_of(cpu)); +} + enum sched_boost_policy { SCHED_BOOST_NONE, SCHED_BOOST_ON_BIG, diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index eb1028697665..5f5c94e2a1df 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -529,10 +529,11 @@ void schedtune_dequeue_task(struct 
task_struct *p, int cpu) raw_spin_unlock_irqrestore(&bg->lock, irq_flags); } -int schedtune_cpu_boost(int cpu) +int schedtune_cpu_boost_with(int cpu, struct task_struct *p) { struct boost_groups *bg; u64 now; + int task_boost = p ? schedtune_task_boost(p) : -100; bg = &per_cpu(cpu_boost_groups, cpu); now = sched_clock_cpu(cpu); @@ -541,7 +542,7 @@ int schedtune_cpu_boost(int cpu) if (schedtune_boost_timeout(now, bg->boost_ts)) schedtune_cpu_update(cpu, now); - return bg->boost_max; + return max(bg->boost_max, task_boost); } int schedtune_task_boost(struct task_struct *p) diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index 9508c151a42b..4ab18eddd8e6 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -12,7 +12,7 @@ struct target_nrg { struct reciprocal_value rdiv; }; -int schedtune_cpu_boost(int cpu); +int schedtune_cpu_boost_with(int cpu, struct task_struct *p); int schedtune_task_boost(struct task_struct *tsk); int schedtune_task_boost_rcu_locked(struct task_struct *tsk); @@ -23,7 +23,7 @@ void schedtune_dequeue_task(struct task_struct *p, int cpu); #else /* CONFIG_SCHED_TUNE */ -#define schedtune_cpu_boost(cpu) 0 +#define schedtune_cpu_boost_with(cpu, p) 0 #define schedtune_task_boost(tsk) 0 #define schedtune_prefer_idle(tsk) 0 diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 2fedd010be1c..3577e39003f8 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -3220,7 +3220,7 @@ void walt_irq_work(struct irq_work *irq_work) cpu_online_mask); num_cpus = cpumask_weight(&cluster_online_cpus); for_each_cpu(cpu, &cluster_online_cpus) { - int flag = SCHED_CPUFREQ_WALT; + int flag = 0; rq = cpu_rq(cpu); diff --git a/kernel/smp.c b/kernel/smp.c index fd749ced516f..bac329f2b441 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -811,16 +811,12 @@ void wake_up_all_idle_cpus(void) { int cpu; - preempt_disable(); - for_each_online_cpu(cpu) { - if (cpu == smp_processor_id()) - continue; - - if (s2idle_state == S2IDLE_STATE_ENTER || - 
!cpu_isolated(cpu)) + for_each_possible_cpu(cpu) { + preempt_disable(); + if (cpu != smp_processor_id() && cpu_online(cpu)) wake_up_if_idle(cpu); + preempt_enable(); } - preempt_enable(); } EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); diff --git a/kernel/softirq.c b/kernel/softirq.c index 4896a0eb178e..5f8ebcccbb0a 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -77,21 +77,6 @@ static void wakeup_softirqd(void) wake_up_process(tsk); } -/* - * If ksoftirqd is scheduled, we do not want to process pending softirqs - * right now. Let ksoftirqd handle this at its own rate, to get fairness, - * unless we're doing some of the synchronous softirqs. - */ -#define SOFTIRQ_NOW_MASK ((1 << HI_SOFTIRQ) | (1 << TASKLET_SOFTIRQ)) -static bool ksoftirqd_running(unsigned long pending) -{ - struct task_struct *tsk = __this_cpu_read(ksoftirqd); - - if (pending & SOFTIRQ_NOW_MASK) - return false; - return tsk && (tsk->state == TASK_RUNNING); -} - /* * preempt_count and SOFTIRQ_OFFSET usage: * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving @@ -333,7 +318,7 @@ asmlinkage __visible void do_softirq(void) pending = local_softirq_pending(); - if (pending && !ksoftirqd_running(pending)) + if (pending) do_softirq_own_stack(); local_irq_restore(flags); @@ -360,9 +345,6 @@ void irq_enter(void) static inline void invoke_softirq(void) { - if (ksoftirqd_running(local_softirq_pending())) - return; - if (!force_irqthreads) { #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK /*