118 Commits

Author SHA1 Message Date
wulan17
4f82e79ddb arch: arm64: configs: Enable ThinLTO
Signed-off-by: wulan17 <wulan17@nusantararom.org>
2025-12-17 09:33:40 +01:00
Kevin Park
07dbeefe8f GPUCORE-36665 Fix OOB issue on KBASE_IOCTL_CS_TILER_HEAP_INIT
The 'group_id' member of the ioctl (KBASE_IOCTL_CS_TILER_HEAP_INIT) struct
must be validated before initializing the CSF tiler heap.
Otherwise an out-of-bounds access to the memory group pools array for the
CSF tiler heap could occur and potentially lead to a kernel panic.

TI2: 933204 (DDK Precommit)
TI2: 933199 (BASE_CSF_TEST)

Bug: 259061568
Test: verified fix using poc
Provenance: https://code.ipdelivery.arm.com/c/GPU/mali-ddk/+/4766
Change-Id: I209a3d5152a34c278c17383e4aa9080aa9735822
(cherry picked from commit 55b44117111bf6a7e324301cbbf4f89669fa04c3)
2025-12-14 18:28:47 +00:00
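The shape of the fix can be sketched in userspace C. The names and the pool count below are assumptions for illustration, not the actual kbase driver code; the point is simply that untrusted ioctl input must be range-checked before it is used as an array index:

```c
#include <assert.h>
#include <errno.h>
#include <stdint.h>

#define MEMORY_GROUP_MANAGER_NR_GROUPS 16 /* assumed size of the pools array */

struct tiler_heap_init_args {
    uint64_t gpu_heap_va;
    uint8_t group_id; /* untrusted index into the memory group pools array */
};

/* Validate ioctl input before any pool array access. */
static int tiler_heap_init(const struct tiler_heap_init_args *args)
{
    if (args->group_id >= MEMORY_GROUP_MANAGER_NR_GROUPS)
        return -EINVAL; /* reject out-of-range group_id up front */
    /* ... safe to index pools[args->group_id] from here on ... */
    return 0;
}
```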
Akash Goel
379ddcf6d2 GPUCORE-36251: Make HeapContext GPU VA to be GPU cacheline aligned
A customer reported an issue where an unexpected GPU page fault happened
because the Tiler tried to access a chunk that had already been freed by
Userspace. The issue was root caused to cacheline sharing between the
HeapContexts of 2 Tiler heaps of the same Kbase context.

The page fault occurred for an Application that made use of more than 1
GPU queue group, where one of the groups, and its corresponding Tiler heap
instance, was created and destroyed multiple times over the lifetime of
the Application.

Kbase sub-allocates memory for a HeapContext from a 4KB page that is
mapped as cached on GPU side, and the memory for HeapContext is zeroed
on allocation through an uncached CPU mapping.
Since the size of HeapContext is 32 bytes, 2 HeapContexts (corresponding
to 2 Tiler heaps of the same context) can end up sharing the same GPU
cacheline (which is 64 bytes in size).

The GPU page fault occurred because FW found a non-NULL, stale value for
the 'free_list_head' pointer in the HeapContext even though the Heap was
newly created, so FW assumed a free chunk was available, passed its
address to the Tiler, and didn't raise an OoM event for the Host.
The stale value was seen because the zeroing of the new HeapContext's
memory on allocation was lost when the cacheline was evicted from the L2
cache. The cacheline had become dirty when FW updated the contents of the
older HeapContext (sharing the cacheline with the new HeapContext) during
a CSG suspend operation.

This commit makes the GPU VA of a HeapContext GPU cacheline aligned to
avoid cacheline sharing. The alignment suffices and no explicit cache
flush is needed when a HeapContext is freed, as the whole GPU cache is
flushed anyway on Odin & Turse GPUs when the initial chunks are freed
just before the HeapContext is freed.

Provenance: https://code.ipdelivery.arm.com/c/GPU/mali-ddk/+/4724/
Test: Boot to home
Bug: 259523790
Change-Id: Ie9e8bffcadbd2ca7705dcd44f9be76754e28138d
Signed-off-by: Jeremy Kemp <jeremykemp@google.com>
2025-12-14 18:26:14 +00:00
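The arithmetic behind the fix can be sketched in plain C, using the sizes given in the commit text (32 B HeapContexts, 64 B GPU cachelines; the function names are stand-ins, not kbase code): consecutive 32 B sub-allocations from one 4 KB page share a cacheline, while rounding the allocation stride up to the cacheline size gives each HeapContext its own line.

```c
#include <assert.h>
#include <stdint.h>

#define HEAP_CTX_SIZE 32u  /* size of a HeapContext, per the commit text */
#define GPU_CACHELINE 64u  /* GPU cacheline size, per the commit text */

/* Byte offset of the Nth sub-allocation within the backing page. */
static uint64_t heap_ctx_offset(unsigned int slot, unsigned int stride)
{
    return (uint64_t)slot * stride;
}

/* Round the allocation stride up to a whole GPU cacheline. */
static unsigned int aligned_stride(void)
{
    return (HEAP_CTX_SIZE + GPU_CACHELINE - 1) / GPU_CACHELINE * GPU_CACHELINE;
}
```

Dividing an offset by GPU_CACHELINE gives its cacheline index, so two HeapContexts collide exactly when those indices match.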
Akash Goel
d89fdf55b1 GPUCORE-35070: Order write to JOB_IRQ_CLEAR reg with read from iface mem
There was an issue on the Turse platform where Kbase sometimes misses the
CSG idle event notification from FW. This happens when FW sends
back-to-back notifications for the same CSG for events like SYNC_UPDATE
and IDLE, but Kbase gets a single IRQ and observes only the first event.

The issue was root caused to a missing barrier on the Kbase side between
the write to the JOB_IRQ_CLEAR register and the read from interface
memory, i.e. CSG_ACK. Without the barrier there is no guarantee about the
ordering; the write to JOB_IRQ_CLEAR can take effect after the read from
interface memory.
The ordering is needed considering the way FW & Kbase write to the
JOB_IRQ_RAWSTAT & JOB_IRQ_CLEAR registers without any synchronization.

This commit adds a dmb(osh) barrier after the write to JOB_IRQ_CLEAR to
resolve the issue.

TI2: 896668 (PLAN-12467r490 TGT CS Nightly, few CSF scenarios)
Bug: 243913790
Test: SST ~4600 hours
Provenance: https://code.ipdelivery.arm.com/c/GPU/mali-ddk/+/3841
Change-Id: I094a3b55c8ae28e8126057cdaf81990f62cd388e
(cherry picked from commit 220d89fd264b11a5b68290c3ca5a8c232e1d45db)
2025-12-14 18:16:52 +00:00
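A userspace analogue of the ordering requirement, as a sketch only: dmb(osh) itself is an arm64 kernel barrier macro, so a C11 atomic_thread_fence stands in for it here, and the two atomics stand in for the MMIO register and the interface memory. The point is that the IRQ clear must take effect before the ACK read, so a notification raised after the clear cannot be missed.

```c
#include <assert.h>
#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint32_t job_irq_clear; /* stands in for the JOB_IRQ_CLEAR reg */
static _Atomic uint32_t csg_ack;       /* stands in for CSG_ACK in iface mem */

static uint32_t clear_irq_then_read_ack(uint32_t mask)
{
    atomic_store_explicit(&job_irq_clear, mask, memory_order_relaxed);
    /* Full fence between the clear and the read, in the spirit of dmb(osh):
     * without it the two accesses may be reordered against each other. */
    atomic_thread_fence(memory_order_seq_cst);
    return atomic_load_explicit(&csg_ack, memory_order_relaxed);
}
```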
Paul E. McKenney
1cea7dab2a rcu: Report error for bad rcu_nocbs= parameter values
This commit prints a console message when cpulist_parse() reports a
bad list of CPUs, and sets all CPUs' bits in that case.  The reason for
setting all CPUs' bits is that this is the safe(r) choice for real-time
workloads, which would normally be the ones using the rcu_nocbs= kernel
boot parameter.  Either way, later RCU console log messages list the
actual set of CPUs whose RCU callbacks will be offloaded.

Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
Signed-off-by: celtare21 <celtare21@gmail.com>
Signed-off-by: Fiqri Ardyansyah <fiqri15072019@gmail.com>
Signed-off-by: Edwiin Kusuma Jaya <kutemeikito0905@gmail.com>
2025-12-14 18:01:37 +00:00
Paul E. McKenney
cdb948743a rcu: Allow rcu_nocbs= to specify all CPUs
Currently, the rcu_nocbs= kernel boot parameter requires that a specific
list of CPUs be specified, and has no way to say "all of them".
As noted by user RavFX in a comment to Phoronix topic 1002538, this
is an inconvenient side effect of the removal of the RCU_NOCB_CPU_ALL
Kconfig option.  This commit therefore enables the rcu_nocbs= kernel boot
parameter to be given the string "all", as in "rcu_nocbs=all" to specify
that all CPUs on the system are to have their RCU callbacks offloaded.

Another approach would be to make cpulist_parse() check for "all", but
there are uses of cpulist_parse() that do other checking, which could
conflict with an "all".  This commit therefore focuses on the specific
use of cpulist_parse() in rcu_nocb_setup().

Just a note to other people who would like changes to Linux-kernel RCU:
If you send your requests to me directly, they might get fixed somewhat
faster.  RavFX's comment was posted on January 22, 2018 and I first saw
it on March 5, 2019.  And the only reason that I found it -at- -all- was
that I was looking for projects using RCU, and my search engine showed
me that Phoronix comment quite by accident.  Your choice, though!  ;-)

Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
Signed-off-by: celtare21 <celtare21@gmail.com>
Signed-off-by: Fiqri Ardyansyah <fiqri15072019@gmail.com>
Signed-off-by: Edwiin Kusuma Jaya <kutemeikito0905@gmail.com>
2025-12-14 18:01:17 +00:00
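The commit's approach — special-case the literal "all" before falling back to list parsing, rather than teaching cpulist_parse() about it — can be sketched in userspace C with a toy 64-CPU mask. The single-range parser below is a stand-in for cpulist_parse(), not the kernel's code:

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Parse an rcu_nocbs=-style value into a 64-bit CPU mask (sketch). */
static int nocb_parse(const char *str, uint64_t *mask)
{
    unsigned int lo, hi;

    if (strcmp(str, "all") == 0) {   /* "rcu_nocbs=all" */
        *mask = UINT64_MAX;          /* offload callbacks on every CPU */
        return 0;
    }
    if (sscanf(str, "%u-%u", &lo, &hi) == 2 && lo <= hi && hi < 64) {
        *mask = 0;                   /* stand-in for cpulist_parse() */
        for (unsigned int cpu = lo; cpu <= hi; cpu++)
            *mask |= UINT64_C(1) << cpu;
        return 0;
    }
    return -1;                       /* bad list: caller can report & fall back */
}
```

Keeping the "all" check at this one call site avoids changing cpulist_parse() for every other caller, per the commit's reasoning.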
Wilco Dijkstra
af15843f65 arm64: Use optimized memcmp
Patch written by Wilco Dijkstra submitted for review to newlib:
https://sourceware.org/ml/newlib/2017/msg00524.html

This is an optimized memcmp for AArch64.  This is a complete rewrite
using a different algorithm.  The previous version split into cases
where both inputs were aligned, where they were mutually aligned, and
where they were unaligned (handled with a byte loop).  The new version
combines all these cases, while small inputs of less than 8 bytes are
handled separately.

This allows the main code to be sped up using unaligned loads since
there are now at least 8 bytes to be compared.  After the first 8 bytes,
align the first input.  This ensures each iteration does at most one
unaligned access and mutually aligned inputs behave as aligned.
After the main loop, process the last 8 bytes using unaligned accesses.

This improves performance of (mutually) aligned cases by 25% and
unaligned by >500% (yes >6 times faster) on large inputs.
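The strategy described above can be sketched in portable C (the real version is hand-written arm64 assembly, so this is only an illustration of the algorithm): inputs shorter than 8 bytes take a byte loop, longer inputs are compared 8 bytes at a time with unaligned-safe loads, and the tail is handled with one overlapping 8-byte compare.

```c
#include <stddef.h>
#include <string.h>

static int memcmp_sketch(const void *a, const void *b, size_t n)
{
    const unsigned char *p = a, *q = b;

    if (n < 8) {                            /* small inputs: byte loop */
        for (size_t i = 0; i < n; i++)
            if (p[i] != q[i])
                return p[i] - q[i];
        return 0;
    }
    for (size_t i = 0; i + 8 <= n; i += 8) {
        unsigned long long x, y;
        memcpy(&x, p + i, 8);               /* unaligned-safe 8-byte load */
        memcpy(&y, q + i, 8);
        if (x != y)                         /* mismatch: resolve order bytewise */
            return memcmp(p + i, q + i, 8);
    }
    /* Overlapping final 8 bytes cover any remainder without a byte loop. */
    return memcmp(p + n - 8, q + n - 8, 8);
}
```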

2017-06-28  Wilco Dijkstra  <wdijkstr@arm.com>

        * bionic/libc/arch-arm64/generic/bionic/memcmp.S (memcmp):
                Rewrite of optimized memcmp.

GLIBC benchtests/bench-memcmp.c performance comparison for Cortex-A53:

Length    1, alignment  1/ 1:        153%
Length    1, alignment  1/ 1:        119%
Length    1, alignment  1/ 1:        154%
Length    2, alignment  2/ 2:        121%
Length    2, alignment  2/ 2:        140%
Length    2, alignment  2/ 2:        121%
Length    3, alignment  3/ 3:        105%
Length    3, alignment  3/ 3:        105%
Length    3, alignment  3/ 3:        105%
Length    4, alignment  4/ 4:        155%
Length    4, alignment  4/ 4:        154%
Length    4, alignment  4/ 4:        161%
Length    5, alignment  5/ 5:        173%
Length    5, alignment  5/ 5:        173%
Length    5, alignment  5/ 5:        173%
Length    6, alignment  6/ 6:        145%
Length    6, alignment  6/ 6:        145%
Length    6, alignment  6/ 6:        145%
Length    7, alignment  7/ 7:        125%
Length    7, alignment  7/ 7:        125%
Length    7, alignment  7/ 7:        125%
Length    8, alignment  8/ 8:        111%
Length    8, alignment  8/ 8:        130%
Length    8, alignment  8/ 8:        124%
Length    9, alignment  9/ 9:        160%
Length    9, alignment  9/ 9:        160%
Length    9, alignment  9/ 9:        150%
Length   10, alignment 10/10:        170%
Length   10, alignment 10/10:        137%
Length   10, alignment 10/10:        150%
Length   11, alignment 11/11:        160%
Length   11, alignment 11/11:        160%
Length   11, alignment 11/11:        160%
Length   12, alignment 12/12:        146%
Length   12, alignment 12/12:        168%
Length   12, alignment 12/12:        156%
Length   13, alignment 13/13:        167%
Length   13, alignment 13/13:        167%
Length   13, alignment 13/13:        173%
Length   14, alignment 14/14:        167%
Length   14, alignment 14/14:        168%
Length   14, alignment 14/14:        168%
Length   15, alignment 15/15:        168%
Length   15, alignment 15/15:        173%
Length   15, alignment 15/15:        173%
Length    1, alignment  0/ 0:        134%
Length    1, alignment  0/ 0:        127%
Length    1, alignment  0/ 0:        119%
Length    2, alignment  0/ 0:        94%
Length    2, alignment  0/ 0:        94%
Length    2, alignment  0/ 0:        106%
Length    3, alignment  0/ 0:        82%
Length    3, alignment  0/ 0:        87%
Length    3, alignment  0/ 0:        82%
Length    4, alignment  0/ 0:        115%
Length    4, alignment  0/ 0:        115%
Length    4, alignment  0/ 0:        122%
Length    5, alignment  0/ 0:        127%
Length    5, alignment  0/ 0:        119%
Length    5, alignment  0/ 0:        127%
Length    6, alignment  0/ 0:        103%
Length    6, alignment  0/ 0:        100%
Length    6, alignment  0/ 0:        100%
Length    7, alignment  0/ 0:        82%
Length    7, alignment  0/ 0:        91%
Length    7, alignment  0/ 0:        87%
Length    8, alignment  0/ 0:        111%
Length    8, alignment  0/ 0:        124%
Length    8, alignment  0/ 0:        124%
Length    9, alignment  0/ 0:        136%
Length    9, alignment  0/ 0:        136%
Length    9, alignment  0/ 0:        136%
Length   10, alignment  0/ 0:        136%
Length   10, alignment  0/ 0:        135%
Length   10, alignment  0/ 0:        136%
Length   11, alignment  0/ 0:        136%
Length   11, alignment  0/ 0:        136%
Length   11, alignment  0/ 0:        135%
Length   12, alignment  0/ 0:        136%
Length   12, alignment  0/ 0:        136%
Length   12, alignment  0/ 0:        136%
Length   13, alignment  0/ 0:        135%
Length   13, alignment  0/ 0:        136%
Length   13, alignment  0/ 0:        136%
Length   14, alignment  0/ 0:        136%
Length   14, alignment  0/ 0:        136%
Length   14, alignment  0/ 0:        136%
Length   15, alignment  0/ 0:        136%
Length   15, alignment  0/ 0:        136%
Length   15, alignment  0/ 0:        136%
Length    4, alignment  0/ 0:        115%
Length    4, alignment  0/ 0:        115%
Length    4, alignment  0/ 0:        115%
Length   32, alignment  0/ 0:        127%
Length   32, alignment  7/ 2:        395%
Length   32, alignment  0/ 0:        127%
Length   32, alignment  0/ 0:        127%
Length    8, alignment  0/ 0:        111%
Length    8, alignment  0/ 0:        124%
Length    8, alignment  0/ 0:        124%
Length   64, alignment  0/ 0:        128%
Length   64, alignment  6/ 4:        475%
Length   64, alignment  0/ 0:        131%
Length   64, alignment  0/ 0:        134%
Length   16, alignment  0/ 0:        128%
Length   16, alignment  0/ 0:        119%
Length   16, alignment  0/ 0:        128%
Length  128, alignment  0/ 0:        129%
Length  128, alignment  5/ 6:        475%
Length  128, alignment  0/ 0:        130%
Length  128, alignment  0/ 0:        129%
Length   32, alignment  0/ 0:        126%
Length   32, alignment  0/ 0:        126%
Length   32, alignment  0/ 0:        126%
Length  256, alignment  0/ 0:        127%
Length  256, alignment  4/ 8:        545%
Length  256, alignment  0/ 0:        126%
Length  256, alignment  0/ 0:        128%
Length   64, alignment  0/ 0:        171%
Length   64, alignment  0/ 0:        171%
Length   64, alignment  0/ 0:        174%
Length  512, alignment  0/ 0:        126%
Length  512, alignment  3/10:        585%
Length  512, alignment  0/ 0:        126%
Length  512, alignment  0/ 0:        127%
Length  128, alignment  0/ 0:        129%
Length  128, alignment  0/ 0:        128%
Length  128, alignment  0/ 0:        129%
Length 1024, alignment  0/ 0:        125%
Length 1024, alignment  2/12:        611%
Length 1024, alignment  0/ 0:        126%
Length 1024, alignment  0/ 0:        126%
Length  256, alignment  0/ 0:        128%
Length  256, alignment  0/ 0:        127%
Length  256, alignment  0/ 0:        128%
Length 2048, alignment  0/ 0:        125%
Length 2048, alignment  1/14:        625%
Length 2048, alignment  0/ 0:        125%
Length 2048, alignment  0/ 0:        125%
Length  512, alignment  0/ 0:        126%
Length  512, alignment  0/ 0:        127%
Length  512, alignment  0/ 0:        127%
Length 4096, alignment  0/ 0:        125%
Length 4096, alignment  0/16:        125%
Length 4096, alignment  0/ 0:        125%
Length 4096, alignment  0/ 0:        125%
Length 1024, alignment  0/ 0:        126%
Length 1024, alignment  0/ 0:        126%
Length 1024, alignment  0/ 0:        126%
Length 8192, alignment  0/ 0:        125%
Length 8192, alignment 63/18:        636%
Length 8192, alignment  0/ 0:        125%
Length 8192, alignment  0/ 0:        125%
Length   16, alignment  1/ 2:        317%
Length   16, alignment  1/ 2:        317%
Length   16, alignment  1/ 2:        317%
Length   32, alignment  2/ 4:        395%
Length   32, alignment  2/ 4:        395%
Length   32, alignment  2/ 4:        398%
Length   64, alignment  3/ 6:        475%
Length   64, alignment  3/ 6:        475%
Length   64, alignment  3/ 6:        477%
Length  128, alignment  4/ 8:        479%
Length  128, alignment  4/ 8:        479%
Length  128, alignment  4/ 8:        479%
Length  256, alignment  5/10:        543%
Length  256, alignment  5/10:        539%
Length  256, alignment  5/10:        543%
Length  512, alignment  6/12:        585%
Length  512, alignment  6/12:        585%
Length  512, alignment  6/12:        585%
Length 1024, alignment  7/14:        611%
Length 1024, alignment  7/14:        611%
Length 1024, alignment  7/14:        611%

Signed-off-by: Francisco Franco <franciscofranco.1990@gmail.com>
Signed-off-by: kdrag0n <dragon@khronodragon.com>
Signed-off-by: utsavbalar1231 <utsavbalar1231@gmail.com>
Signed-off-by: Fiqri Ardyansyah <fiqri15072019@gmail.com>
Signed-off-by: Edwiin Kusuma Jaya <kutemeikito0905@gmail.com>
2025-12-14 17:58:46 +00:00
Yuanyuan Zhong
0d53b9a549 arm64: strcmp: align to 64B cache line
Align strcmp to 64B. This ensures the performance-critical
loop is within one 64B cache line.

Change-Id: I9240fbb4407637b2290a44e02ad59098a377b356
Signed-off-by: Yuanyuan Zhong <zyy@motorola.com>
Reviewed-on: https://gerrit.mot.com/902536
SME-Granted: SME Approvals Granted
SLTApproved: Slta Waiver <sltawvr@motorola.com>
Tested-by: Jira Key <jirakey@motorola.com>
Reviewed-by: Yi-Wei Zhao <gbjc64@motorola.com>
Reviewed-by: Igor Kovalenko <igork@motorola.com>
Submit-Approved: Jira Key <jirakey@motorola.com>
Signed-off-by: Nathan Chancellor <natechancellor@gmail.com>
Signed-off-by: Nauval Rizky <enuma.alrizky@gmail.com>
Signed-off-by: Fiqri Ardyansyah <fiqri15072019@gmail.com>
Signed-off-by: Edwiin Kusuma Jaya <kutemeikito0905@gmail.com>
2025-12-14 17:58:38 +00:00
Rafael Ortolan
244a84a926 driver/usb: Fix buffer overflow issue detected by KASAN
Fix a stack-out-of-bounds issue detected by KASAN, which could result
in random kernel memory corruption:

[685:tcpc_event_type]==================================================================
[685:tcpc_event_type]BUG: KASAN: stack-out-of-bounds in mt6360_transmit+0xec/0x260
[685:tcpc_event_type]Write of size 28 at addr ffffffe6ca09f963 by task tcpc_event_type/685
[685:tcpc_event_type]
[685:tcpc_event_type]CPU: 1 PID: 685 Comm: tcpc_event_type Tainted: G S      W  O    4.14.186+ #1
[685:tcpc_event_type]Hardware name: MT6853V/NZA (DT)
[685:tcpc_event_type]Call trace:
[685:tcpc_event_type] dump_backtrace+0x0/0x374
[685:tcpc_event_type] show_stack+0x20/0x2c
[685:tcpc_event_type] dump_stack+0x148/0x1b8
[685:tcpc_event_type] print_address_description+0x70/0x248
[685:tcpc_event_type] __kasan_report+0x150/0x180
[685:tcpc_event_type] kasan_report+0x10/0x18
[685:tcpc_event_type] check_memory_region+0x18c/0x198
[685:tcpc_event_type] memcpy+0x48/0x68
[685:tcpc_event_type] mt6360_transmit+0xec/0x260
[685:tcpc_event_type] tcpci_transmit+0xb8/0xe4
[685:tcpc_event_type] pd_send_message+0x238/0x388
[685:tcpc_event_type] pd_reply_svdm_request+0x1f0/0x2f8
[685:tcpc_event_type] pd_dpm_ufp_request_id_info+0xcc/0x188
[685:tcpc_event_type] pe_ufp_vdm_get_identity_entry+0x1c/0x28
[685:tcpc_event_type] pd_handle_event+0x3cc/0x74c
[685:tcpc_event_type] pd_policy_engine_run+0x18c/0x748
[685:tcpc_event_type] tcpc_event_thread_fn+0x1b4/0x32c
[685:tcpc_event_type] kthread+0x2a8/0x2c0
[685:tcpc_event_type] ret_from_fork+0x10/0x18
[685:tcpc_event_type]==================================================================

Change-Id: I25ee1b2457592d470619f3bea1fb3fc1a2bc678c
Reviewed-on: https://gerrit.mot.com/2320832
SME-Granted: SME Approvals Granted
SLTApproved: Slta Waiver
Reviewed-by: Murilo Alves <alvesm@motorola.com>
Reviewed-by: Gilberto Gambugge Neto <gambugge@motorola.com>
Tested-by: Jira Key
Submit-Approved: Jira Key
Signed-off-by: Murilo Alves <alvesm@motorola.com>
Reviewed-on: https://gerrit.mot.com/2334041
Reviewed-by: Rafael Ortolan <rafones@motorola.com>
Reviewed-by: Zhihong Kang <kangzh@motorola.com>
2025-12-13 16:31:14 +00:00
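The usual fix shape for this class of bug can be sketched in userspace C. The names and buffer size below are assumptions for illustration, not the mt6360 driver's actual code: bound the copy length by the destination buffer's size before memcpy, instead of trusting a caller-supplied message length.

```c
#include <errno.h>
#include <string.h>

#define TX_BUF_SIZE 28u /* assumed size of the transmit buffer */

static int transmit(unsigned char dst[TX_BUF_SIZE],
                    const unsigned char *payload, size_t len)
{
    if (len > TX_BUF_SIZE)
        return -EINVAL; /* reject instead of overflowing the stack buffer */
    memcpy(dst, payload, len);
    return 0;
}
```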
Damien Le Moal
b7f275383a block: Expose queue nr_zones in sysfs
Expose through sysfs the nr_zones field of struct request_queue.
Exposing this value helps in debugging disk issues as well as
facilitating script-based use of the disk (e.g. blktests).

For zoned block devices, the nr_zones field indicates the total number
of zones of the device calculated using the known disk capacity and
zone size. This number of zones is always 0 for regular block devices.

Since nr_zones is defined conditionally with CONFIG_BLK_DEV_ZONED,
introduce the blk_queue_nr_zones() function to return the correct value
for any device, regardless of whether CONFIG_BLK_DEV_ZONED is set.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-12-08 00:52:09 +00:00
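The helper's shape — an accessor that compiles to either the field read or a constant 0 depending on the config option — can be sketched with stand-in names (this mirrors the blk_queue_nr_zones() idea, not the kernel's exact code):

```c
#define CONFIG_BLK_DEV_ZONED 1 /* flip to 0 to model a non-zoned build */

struct request_queue_sketch {
#if CONFIG_BLK_DEV_ZONED
    unsigned int nr_zones; /* total zones; 0 for regular block devices */
#endif
    int dummy;
};

static unsigned int queue_nr_zones(const struct request_queue_sketch *q)
{
#if CONFIG_BLK_DEV_ZONED
    return q->nr_zones;
#else
    return 0; /* field absent: every device reports 0 zones */
#endif
}
```

Callers never need their own #ifdef, which is the whole point of the helper.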
Damien Le Moal
2f17d2875c block: Improve zone reset execution
There is no need to synchronously execute all REQ_OP_ZONE_RESET BIOs
necessary to reset a range of zones. Similarly to what is done for
discard BIOs in blk-lib.c, all zone reset BIOs can be chained and
executed asynchronously and a synchronous call done only for the last
BIO of the chain.

Modify blkdev_reset_zones() to operate similarly to
blkdev_issue_discard() using the next_bio() helper for chaining BIOs. To
avoid code duplication of that function in blk_zoned.c, rename
next_bio() into blk_next_bio() and declare it as a block internal
function in blk.h.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-12-08 00:52:09 +00:00
Damien Le Moal
f243b64a11 block: Introduce BLKGETNRZONES ioctl
Get a zoned block device's total number of zones. The device can be a
partition of the whole device. The number of zones is always 0 for
regular block devices.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-12-08 00:52:08 +00:00
Damien Le Moal
6645652532 block: Introduce BLKGETZONESZ ioctl
Get a zoned block device's zone size in number of 512 B sectors.
The zone size is always 0 for regular block devices.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-12-08 00:52:08 +00:00
Damien Le Moal
6a2c25e507 block: Limit allocation of zone descriptors for report zones
There is no point in allocating more zone descriptors than the number of
zones a block device has for doing a zone report. Avoid doing that in
blkdev_report_zones_ioctl() by limiting the number of zone descriptors
allocated internally to process the user request.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-12-08 00:52:08 +00:00
Damien Le Moal
2fe6c878c8 block: Introduce blkdev_nr_zones() helper
Introduce the blkdev_nr_zones() helper function to get the total
number of zones of a zoned block device. This number is always 0 for a
regular block device (q->limits.zoned == BLK_ZONED_NONE case).

Replace hard-coded number of zones calculation in dmz_get_zoned_device()
with a call to this helper.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-12-08 00:52:08 +00:00
Omar Sandoval
5da7123f99 kyber: fix integer overflow of latency targets on 32-bit
NSEC_PER_SEC has type long, so 5 * NSEC_PER_SEC is calculated as a long.
However, 5 seconds is 5,000,000,000 nanoseconds, which overflows a
32-bit long. Make sure all of the targets are calculated as 64-bit
values.

Fixes: 6e25cb01ea20 ("kyber: implement improved heuristics")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-12-08 00:52:08 +00:00
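The bug class is easy to demonstrate: evaluated in 32 bits, 5 * NSEC_PER_SEC wraps, while forcing 64-bit arithmetic (as the fix does for Kyber's latency targets) gives the intended 5,000,000,000 ns. Unsigned types are used below so the wraparound is well-defined for the demonstration:

```c
#include <stdint.h>

#define NSEC_PER_SEC 1000000000

/* Models the buggy case: the multiply happens in 32 bits and wraps. */
static uint32_t target_32bit(uint32_t seconds)
{
    return seconds * (uint32_t)NSEC_PER_SEC;
}

/* Models the fix: force the multiply into 64 bits. */
static uint64_t target_64bit(uint64_t seconds)
{
    return seconds * (uint64_t)NSEC_PER_SEC;
}
```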
Omar Sandoval
512db70a52 kyber: add tracepoints
When debugging Kyber, it's really useful to know what latencies we've
been having, how the domain depths have been adjusted, and if we've
actually been throttling. Add three tracepoints, kyber_latency,
kyber_adjust, and kyber_throttled, to record that.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-12-08 00:52:08 +00:00
Omar Sandoval
4b06e8872a kyber: implement improved heuristics
Kyber's current heuristics have a few flaws:

- It's based on the mean latency, but p99 latency tends to be more
  meaningful to anyone who cares about latency. The mean can also be
  skewed by rare outliers that the scheduler can't do anything about.
- The statistics calculations are purely time-based with a short window.
  This works for steady, high load, but is more sensitive to outliers
  with bursty workloads.
- It only considers the latency once an I/O has been submitted to the
  device, but the user cares about the time spent in the kernel, as
  well.

These are shortcomings of the generic blk-stat code which doesn't quite
fit the ideal use case for Kyber. So, this replaces the statistics with
a histogram used to calculate percentiles of total latency and I/O
latency, which we then use to adjust depths in a slightly more
intelligent manner:

- Sync and async writes are now the same domain.
- Discards are a separate domain.
- Domain queue depths are scaled by the ratio of the p99 total latency
  to the target latency (e.g., if the p99 latency is double the target
  latency, we will double the queue depth; if the p99 latency is half of
  the target latency, we can halve the queue depth).
- We use the I/O latency to determine whether we should scale queue
  depths down: we will only scale down if any domain's I/O latency
  exceeds the target latency, which is an indicator of congestion in the
  device.

These new heuristics are just as scalable as the heuristics they
replace.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-12-08 00:52:08 +00:00
Omar Sandoval
95c81bbedc kyber: don't make domain token sbitmap larger than necessary
The domain token sbitmaps are currently initialized to the device queue
depth or 256, whichever is larger, and immediately resized to the
maximum depth for that domain (256, 128, or 64 for read, write, and
other, respectively). The sbitmap is never resized larger than that, so
it's unnecessary to allocate a bitmap larger than the maximum depth.
Let's just allocate it to the maximum depth to begin with. This will use
marginally less memory, and more importantly, give us a more appropriate
number of bits per sbitmap word.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-12-08 00:52:07 +00:00
Omar Sandoval
0a9c2ef26a block: move call of scheduler's ->completed_request() hook
Commit 4bc6339a58 ("block: move blk_stat_add() to
__blk_mq_end_request()") consolidated some calls using ktime_get() so
we'd only need to call it once. Kyber's ->completed_request() hook also
calls ktime_get(), so let's move it to the same place, too.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-12-08 00:52:07 +00:00
claxten10
2d3c7708e0 arch: arm64: configs: Enable Kyber I/O sched
Signed-off-by: claxten10 <claxten10@gmail.com>
2025-12-08 00:52:07 +00:00
Roman Gushchin
e8f74fb113 mm: memcg/slab: generalize postponed non-root kmem_cache deactivation
Currently SLUB uses a work scheduled after an RCU grace period to
deactivate a non-root kmem_cache.  This mechanism can be reused for
kmem_caches release, but requires generalization for SLAB case.

Introduce kmemcg_cache_deactivate() function, which calls
allocator-specific __kmem_cache_deactivate() and schedules execution of
__kmem_cache_deactivate_after_rcu() with all necessary locks in a worker
context after an rcu grace period.

Here is the new calling scheme:
  kmemcg_cache_deactivate()
    __kmemcg_cache_deactivate()                  SLAB/SLUB-specific
    kmemcg_rcufn()                               rcu
      kmemcg_workfn()                            work
        __kmemcg_cache_deactivate_after_rcu()    SLAB/SLUB-specific

instead of:
  __kmemcg_cache_deactivate()                    SLAB/SLUB-specific
    slab_deactivate_memcg_cache_rcu_sched()      SLUB-only
      kmemcg_rcufn()                             rcu
        kmemcg_workfn()                          work
          kmemcg_cache_deact_after_rcu()         SLUB-only

For consistency, all allocator-specific functions start with "__".

Link: http://lkml.kernel.org/r/20190611231813.3148843-4-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Waiman Long <longman@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Qian Cai <cai@lca.pw>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2025-12-08 00:52:07 +00:00
Sultan Alsawaf
5ac450d018 arch: arm64: configs: Disable SLUB per-CPU partial caches
CONFIG_SLUB_CPU_PARTIAL is not set

Per-CPU partial caches cause load spikes when they are filled and
need to be drained, which is bad for maintaining low latency.

Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>
2025-12-08 00:52:07 +00:00
Manaf Meethalavalappu Pallikunhi
04450f04ff arch: arm64: configs: Enable powercap framework
CONFIG_POWERCAP=y

It enables the power capping sysfs interface for
different power zone devices.

Bug: 220884335
Change-Id: I11bc3efe06d2a02dcc602d223d3e6757088ca771
Signed-off-by: Manaf Meethalavalappu Pallikunhi <quic_manafm@quicinc.com>
2025-12-08 00:52:07 +00:00
Ocean Chen
6fe001fbc3 arch: arm64: configs: Enable zram-writeback
CONFIG_ZRAM_WRITEBACK=y

Bug: 142299185
Change-Id: Id9a928d436a3069c32e7569bfddc6da79beee3c2
Signed-off-by: Ocean Chen <oceanchen@google.com>
2025-12-08 00:52:07 +00:00
Paul Zhang
1537524516 arch: arm64: configs: Disable CONFIG_CFG80211_CRDA_SUPPORT
CONFIG_CFG80211_CRDA_SUPPORT is not set

Since CRDA is not supported, disable CONFIG_CFG80211_CRDA_SUPPORT
by default.

Change-Id: I01bde48aea21612b9d5c79b11931999e02d610b4
CRs-Fixed: 2946898
Signed-off-by: Paul Zhang <paulz@codeaurora.org>
2025-12-08 00:52:06 +00:00
Nathan Chancellor
6c5709097a kernel/profile: Use cpumask_available to check for NULL cpumask
When building with clang + -Wtautological-pointer-compare, these
instances pop up:

  kernel/profile.c:339:6: warning: comparison of array 'prof_cpu_mask' not equal to a null pointer is always true [-Wtautological-pointer-compare]
          if (prof_cpu_mask != NULL)
              ^~~~~~~~~~~~~    ~~~~
  kernel/profile.c:376:6: warning: comparison of array 'prof_cpu_mask' not equal to a null pointer is always true [-Wtautological-pointer-compare]
          if (prof_cpu_mask != NULL)
              ^~~~~~~~~~~~~    ~~~~
  kernel/profile.c:406:26: warning: comparison of array 'prof_cpu_mask' not equal to a null pointer is always true [-Wtautological-pointer-compare]
          if (!user_mode(regs) && prof_cpu_mask != NULL &&
                                ^~~~~~~~~~~~~    ~~~~
  3 warnings generated.

This can be addressed with the cpumask_available helper, introduced in
commit f7e30f0 ("cpumask: Add helper cpumask_available()") to fix
warnings like this while keeping the code the same.

Link: ClangBuiltLinux#747
Link: http://lkml.kernel.org/r/20191022191957.9554-1-natechancellor@gmail.com
Signed-off-by: Nathan Chancellor <natechancellor@gmail.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2025-12-08 00:52:06 +00:00
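Why the helper exists can be shown with a userspace analogue (stand-in names, not the kernel's cpumask implementation): with CPUMASK_OFFSTACK the mask variable is a pointer that may be NULL until allocated, but without it the variable is an array, so `mask != NULL` is tautologically true and clang warns. An accessor hides the difference from callers:

```c
#include <stdbool.h>
#include <stddef.h>

#define MASK_OFFSTACK 0 /* flip to 1 to model the pointer-based variant */

#if MASK_OFFSTACK
typedef unsigned long *mask_var_t;              /* pointer: can be NULL */
static bool mask_available(mask_var_t m) { return m != NULL; }
#else
typedef unsigned long mask_var_t[4];            /* array: never NULL */
static bool mask_available(mask_var_t m) { (void)m; return true; }
#endif
```

Callers write `if (mask_available(prof_mask))` in both configurations, which is exactly what cpumask_available() provides.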
Davidlohr Bueso
47bb162e2c kernel/sched/core: Add branch prediction hint to wake_q_add() cmpxchg
The cmpxchg() will fail only when the task is already in the process
of waking up, which is an extremely rare occurrence.
Micro-optimize the call by putting an unlikely() around it.

To no surprise, when using CONFIG_PROFILE_ANNOTATED_BRANCHES
under a number of workloads the incorrect rate was a mere 1-2%.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Waiman Long <longman@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Yongji Xie <elohimes@gmail.com>
Cc: andrea.parri@amarulasolutions.com
Cc: lilin24@baidu.com
Cc: liuqi16@baidu.com
Cc: nixun@baidu.com
Cc: xieyongji@baidu.com
Cc: yuanlinsi01@baidu.com
Cc: zhangyu31@baidu.com
Link: https://lkml.kernel.org/r/20181203053130.gwkw6kg72azt2npb@linux-r8p5
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2025-12-08 00:52:06 +00:00
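The pattern can be sketched in userspace C: unlikely() expands to __builtin_expect, telling the compiler to lay out the failure path out of line. The cmpxchg stand-in below models wake_q_add()'s "already queued" check with assumed names, not the scheduler's actual code:

```c
#include <stdbool.h>

#define unlikely(x) __builtin_expect(!!(x), 0)

static bool try_queue(int *node, int pending_marker)
{
    int expected = 0;

    /* Fails only if another CPU already queued the task: rare (~1-2%),
     * so hint the compiler that the success path is the hot one. */
    if (unlikely(!__atomic_compare_exchange_n(node, &expected,
                                              pending_marker, false,
                                              __ATOMIC_ACQ_REL,
                                              __ATOMIC_ACQUIRE)))
        return false; /* task is already in the process of waking up */
    return true;
}
```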
Li zeming
0a3492d5ec kernel/time/alarmtimer: Remove unnecessary initialization of variable 'ret'
ret is assigned before it is checked, so the variable does not need to
be initialized.

Signed-off-by: Li zeming <zeming@nfschina.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20230609182856.4660-1-zeming@nfschina.com
2025-12-08 00:52:06 +00:00
Li zeming
3fd6a03917 kernel/time/alarmtimer: Remove unnecessary (void *) cast
Pointers of type void * do not require a type cast when they are assigned
to a typed pointer.

Signed-off-by: Li zeming <zeming@nfschina.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20230609182059.4509-1-zeming@nfschina.com
2025-12-08 00:52:06 +00:00
Tetsuo Handa
884c45b53e kernel: Initialize cpumask before parsing
KMSAN complains that new_value at cpumask_parse_user() from
write_irq_affinity() from irq_affinity_proc_write() is uninitialized.

  [  148.133411][ T5509] =====================================================
  [  148.135383][ T5509] BUG: KMSAN: uninit-value in find_next_bit+0x325/0x340
  [  148.137819][ T5509]
  [  148.138448][ T5509] Local variable ----new_value.i@irq_affinity_proc_write created at:
  [  148.140768][ T5509]  irq_affinity_proc_write+0xc3/0x3d0
  [  148.142298][ T5509]  irq_affinity_proc_write+0xc3/0x3d0
  [  148.143823][ T5509] =====================================================

Since bitmap_parse() from cpumask_parse_user() calls find_next_bit(),
any alloc_cpumask_var() + cpumask_parse_user() sequence risks
find_next_bit() accessing an uninitialized cpu mask variable. Fix this
problem by replacing alloc_cpumask_var() with zalloc_cpumask_var().

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Link: https://lore.kernel.org/r/20210401055823.3929-1-penguin-kernel@I-love.SAKURA.ne.jp
2025-12-08 00:52:06 +00:00
Philippe Liard
9bc2ed82df fs/squashfs: Migrate from ll_rw_block usage to BIO
The ll_rw_block() function has been deprecated in favor of BIO, which
appears to come with large performance improvements.

This patch decreases boot time by close to 40% when using squashfs for
the root file-system.  This is observed at least in the context of
starting an Android VM on Chrome OS using crosvm.  The patch was tested
on 4.19 as well as master.

This patch is largely based on Adrien Schildknecht's patch that was
originally sent as https://lkml.org/lkml/2017/9/22/814 though with some
significant changes and simplifications while also taking Phillip
Lougher's feedback into account, around preserving support for
FILE_CACHE in particular.

[akpm@linux-foundation.org: fix build error reported by Randy]
  Link: http://lkml.kernel.org/r/319997c2-5fc8-f889-2ea3-d913308a7c1f@infradead.org
Signed-off-by: Philippe Liard <pliard@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Adrien Schildknecht <adrien+dev@schischi.me>
Cc: Phillip Lougher <phillip@squashfs.org.uk>
Cc: Guenter Roeck <groeck@chromium.org>
Cc: Daniel Rosenberg <drosen@google.com>
Link: https://chromium.googlesource.com/chromiumos/platform/crosvm
Link: http://lkml.kernel.org/r/20191106074238.186023-1-pliard@google.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2025-12-08 00:52:05 +00:00
Alexander Winkowski
bd9e610f6d mm/page_alloc: Disable pcp lists checks on !DEBUG_VM
Reference: https://lore.kernel.org/all/20230201162549.68384-1-halbuer@sra.uni-hannover.de/T/#m2d0dccbb7653a8761a657ee046766dcd56e35df9

Signed-off-by: Alexander Winkowski <dereference23@outlook.com>
2025-12-08 00:52:05 +00:00
Minchan Kim
01a6275c21 arch: arm64: configs: Disable CONFIG_MEMCG and MEMCG_SWAP
CONFIG_MEMCG is not set

Pixel doesn't use the memcg but it hurts 15% performance in minor
fault benchmark so disable it until we see strong reason.

Bug: 169443770
Signed-off-by: Minchan Kim <minchan@google.com>
Change-Id: Ifd9ddcd54559c590260d52f60a2e5e4b79c5480f
2025-12-08 00:52:05 +00:00
Frederic Weisbecker
e00a2dfe71 rcu: Assume rcu_init() is called before smp
The rcu_init() function is called way before SMP is initialized and
therefore only the boot CPU should be online at this stage.

Simplify the boot per-cpu initialization accordingly.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Cc: Uladzislau Rezki <uladzislau.rezki@sony.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
2025-12-08 00:52:05 +00:00
Mashopy
a0425d99f8 arch: arm64: boot: dts: Remove initcall_debug=1 for MT6781
This is a production kernel, not a debug one.
2025-12-08 00:52:05 +00:00
Cyrill Gorcunov
c9e54c78d7 rcu: rcu_qs -- Use raise_softirq_irqoff to not save irqs twice
rcu_qs() already disables IRQs itself, so there is no need for
raise_softirq() to do the same; instead we can save some cycles by using
raise_softirq_irqoff() directly.

CC: Paul E. McKenney <paulmck@linux.ibm.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
2025-12-08 00:52:05 +00:00
Paul E. McKenney
04904ffe37 rcu/tiny: Convert to SPDX license identifier
Replace the license boilerplate with an SPDX license identifier.
While in the area, update an email address.

Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
2025-12-08 00:52:05 +00:00
Paul E. McKenney
3dd3836e93 rcu: Rename rcu_check_callbacks() to rcu_sched_clock_irq()
The name rcu_check_callbacks() arguably made sense back in the early
2000s when RCU was quite a bit simpler than it is today, but it has
become quite misleading, especially with the advent of dyntick-idle
and NO_HZ_FULL.  The rcu_check_callbacks() function is RCU's hook into
the scheduling-clock interrupt, and is now but one of many ways that
callbacks get promoted to invocable state.

This commit therefore changes the name to rcu_sched_clock_irq(),
which is the same number of characters and clearly indicates this
function's relation to the rest of the Linux kernel.  In addition, for
the sake of consistency, rcu_flavor_check_callbacks() is also renamed
to rcu_flavor_sched_clock_irq().

While in the area, the header comments for both functions are reworked.

Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
2025-12-08 00:52:04 +00:00
Paul E. McKenney
20743a0645 srcu: Make call_srcu() available during very early boot
Event tracing is moving to SRCU in order to take advantage of the fact
that SRCU may be safely used from idle and even offline CPUs.  However,
event tracing can invoke call_srcu() very early in the boot process,
even before workqueue_init_early() is invoked (let alone rcu_init()).
Therefore, call_srcu()'s attempts to queue work fail miserably.

This commit therefore detects this situation, and refrains from attempting
to queue work before rcu_init() time, but does everything else that it
would have done, and in addition, adds the srcu_struct to a global list.
The rcu_init() function now invokes a new srcu_init() function, which
is empty if CONFIG_SRCU=n.  Otherwise, srcu_init() queues work for
each srcu_struct on the list.  This all happens early enough in boot
that there is but a single CPU with interrupts disabled, which allows
synchronization to be dispensed with.

Of course, the queued work won't actually be invoked until after
workqueue_init() is invoked, which happens shortly after the scheduler
is up and running.  This means that although call_srcu() may be invoked
any time after per-CPU variables have been set up, there is still a very
narrow window when synchronize_srcu() won't work, and this window
extends from the time that the scheduler starts until the time that
workqueue_init() returns.  This can be fixed in a manner similar to
the fix for synchronize_rcu_expedited() and friends, but until someone
actually needs to use synchronize_srcu() during this window, this fix
is added churn for no benefit.

Finally, note that Tree SRCU's new srcu_init() function invokes
queue_work() rather than the queue_delayed_work() function that is
invoked post-boot.  The reason is that queue_delayed_work() will (as you
would expect) post a timer, and timers have not yet been initialized.
So use of queue_work() avoids the complaints about use of uninitialized
spinlocks that would otherwise result.  Besides, some delay is already
provided by the aforementioned fact that the queued work won't actually
be invoked until after the scheduler is up and running.

Requested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tested-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2025-12-08 00:52:04 +00:00
Paul E. McKenney
071728046f rcu: Motivate Tiny RCU forward progress
If a long-running CPU-bound in-kernel task invokes call_rcu(), the
callback won't be invoked until the next context switch.  If there are
no other runnable tasks (which is not an uncommon situation on deep
embedded systems), the callback might never be invoked.

This commit therefore causes rcu_check_callbacks() to ask the scheduler
for a context switch if there are callbacks posted that are still waiting
for a grace period.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
2025-12-08 00:52:04 +00:00
Paul E. McKenney
1e9c40c21d rcu: Clean up flavor-related definitions and comments in tiny.c
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
2025-12-08 00:52:04 +00:00
Paul E. McKenney
8cb39e4b1e rcu: Express Tiny RCU updates in terms of RCU rather than RCU-sched
This commit renames Tiny RCU functions so that the lowest level of
functionality is RCU (e.g., synchronize_rcu()) rather than RCU-sched
(e.g., synchronize_sched()).  This provides greater naming compatibility
with Tree RCU, which will in turn permit more LoC removal once
the RCU-sched and RCU-bh update-side API is removed.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
[ paulmck: Fix Tiny call_rcu()'s EXPORT_SYMBOL() in response to a bug
  report from kbuild test robot. ]
2025-12-08 00:52:04 +00:00
Paul E. McKenney
c10274c5a7 rcu: Define RCU-sched API in terms of RCU for Tree RCU PREEMPT builds
Now that RCU-preempt knows about preemption disabling, its implementation
of synchronize_rcu() works for synchronize_sched(), and likewise for the
other RCU-sched update-side API members.  This commit therefore confines
the RCU-sched update-side code to CONFIG_PREEMPT=n builds, and defines
RCU-sched's update-side API members in terms of those of RCU-preempt.

This means that any given build of the Linux kernel has only one
update-side flavor of RCU, namely RCU-preempt for CONFIG_PREEMPT=y builds
and RCU-sched for CONFIG_PREEMPT=n builds.  This in turn means that kernels
built with CONFIG_RCU_NOCB_CPU=y have only one rcuo kthread per CPU.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
2025-12-08 00:52:04 +00:00
Paul E. McKenney
d3b896077a rcu: Define RCU-bh update API in terms of RCU
Now that the main RCU API knows about softirq disabling and softirq's
quiescent states, the RCU-bh update code can be dispensed with.
This commit therefore removes the RCU-bh update-side implementation and
defines RCU-bh's update-side API in terms of that of either RCU-preempt or
RCU-sched, depending on the setting of the CONFIG_PREEMPT Kconfig option.

In kernels built with CONFIG_RCU_NOCB_CPU=y this has the knock-on effect
of reducing by one the number of rcuo kthreads per CPU.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
2025-12-08 00:52:04 +00:00
Aeron-Aeron
1b014dbe07 perf: make MediaTek perf observer suspend-aware and reduce wakeups
* Converted the observer polling to be suspend-aware so it stops pinging every
  32 ms. Added a pob_timer_active flag, suspend/resume notifier, and bumped the
  interval to 64 ms. The hrtimer callback now quietly backs off when disabled.

* Maybe now the CPU can finally enjoy its beauty sleep.

Signed-off-by: Aeron-Aeron <aeronrules2@gmail.com>
2025-12-08 00:52:03 +00:00
Woomymy
54404ec743 kernel: irq_work: Remove mediatek schedule monitor support
Change-Id: I4cf9879d9e8eb605f37e50fcde089b32ef6e7c9d
2025-12-08 00:52:03 +00:00
Mashopy
3db111c587 gen4m: Let scheduler handle cpu boosting
Change-Id: I06ea48ce6663563c8240a6196bc85a6b6fc0b43c
Signed-off-by: Cyber Knight <cyberknight755@gmail.com>
2025-12-08 00:52:03 +00:00
Cyber Knight
4f7b123d6d connectivity/wlan-core-gen4m: Bump 2.4GHz hotspot bandwidth to 40mhz
- This should improve the reliability of 2.4GHz hotspot connections.

Change-Id: Iea450301518d22701c35040a2581cb37d2d39ccf
Signed-off-by: Cyber Knight <cyberknight755@gmail.com>
2025-12-08 00:52:03 +00:00
kehaizhou
cd86d8762a [ALPS09502745] mgmt: Fix kernel panic due to hardware watchdog
[Description]
Modified debug logging to prevent kernel panic caused by hardware
watchdog when handling IRQ from wifi module.

[Test]
UT

MTK-Commit-Id: cc6d75fbfacaed326aeaa9fdea03d95fe558a6f3

Signed-off-by: kehaizhou <haizhou.ke@mediatek.com>
CR-Id: ALPS09502745
Feature: Others
Change-Id: I67a0b50a587d1e93c7136a685dcd1b8f0e1f7e89
Reviewed-on: https://gerrit.mediatek.inc/c/neptune/wlan_driver/gen4m/+/9912268
Commit-Check: srv_check_service <srv_check_service@mediatek.com>
AutoUT-Review-Label: srv_neptune_adm <srv_neptune_adm@mediatek.com>
Reviewed-by: shuaishuai.kong <shuaishuai.kong@mediatek.com>
(cherry picked from commit ee220b7700144a6d16e0274fede3029aa2543ac3)
Reviewed-on: https://gerrit.mediatek.inc/c/neptune/wlan_driver/gen4m/+/9921411
Build: srv_preflight_a001 <srv_preflight_a001@mediatek.com>
Signed-off-by: Cyber Knight <cyberknight755@gmail.com>
2025-12-08 00:52:03 +00:00
Wyatt Sun
a88c6c1111 [ALPS09702582] Use flexible array member for RX statistics
[Description]
Use a flexible array member to avoid false alarms from the OOB check

[Test]
Build pass to let auto test try it again.

MTK-Commit-Id: 84dcece8624b4d84d597257f5412acb57306998d

CR-Id: ALPS09702582
Feature: Wi-Fi Driver CONNAC
Change-Id: I0de4f74289513ab2177127336ec190b13223ab82
Signed-off-by: Wyatt Sun <wyatt.sun@mediatek.com>
Reviewed-on: https://gerrit.mediatek.inc/c/neptune/wlan_driver/gen4m/+/9152977
Build: srv_preflight_b001 <srv_preflight_b001@mediatek.com>
Build: srv_preflight_a001 <srv_preflight_a001@mediatek.com>
Build: srv_neptune_adm <srv_neptune_adm@mediatek.com>
Coverity-Review-Label: srv_neptune_adm <srv_neptune_adm@mediatek.com>
Test: srv_pf_nep_sanity
AutoUT-Review-Label: srv_neptune_adm <srv_neptune_adm@mediatek.com>
Test: srv_preflight_a001 <srv_preflight_a001@mediatek.com>
Reviewed-by: jim.chuang <jim.chuang@mediatek.com>
ODB-Check: srv_neptune_adm <srv_neptune_adm@mediatek.com>
Commit-Check: srv_check_service <srv_check_service@mediatek.com>
(cherry picked from commit 7663be91b02c2519f15824e05660e2eda9f90afb)
Reviewed-on: https://gerrit.mediatek.inc/c/neptune/wlan_driver/gen4m/+/9389610
Reviewed-by: holiday.hao <holiday.hao@mediatek.com>
Tested-by: jim.chuang <jim.chuang@mediatek.com>
(cherry picked from commit 87f1b9f39912ab3be49df89f399a29eb8bf59719)
Reviewed-on: https://gerrit.mediatek.inc/c/neptune/wlan_driver/gen4m/+/10128328
Signed-off-by: Cyber Knight <cyberknight755@gmail.com>
2025-12-08 00:52:03 +00:00
Xiang Wu
e72240dce9 [ALPS06528517] WAPI: Support SMS4
Claim support for WPI-SMS4 if WAPI is enabled.

MTK-Commit-Id: 2e992a329b0ebbc8a9293f69b69fc3e21ed2377e

CR-Id: ALPS06528517
Change-Id: Id3b383ae0d9718fb28f247860ac33bd9e080cf30
Signed-off-by: Xiang Wu <xiang.wu@mediatek.com>
Feature: WAPI
Reviewed-on: https://gerrit.mediatek.inc/c/neptune/wlan_driver/gen4m/+/8429859
ODB-Check: srv_neptune_adm <srv_neptune_adm@mediatek.com>
Commit-Check: srv_check_service <srv_check_service@mediatek.com>
Build: srv_preflight_b001 <srv_preflight_b001@mediatek.com>
Build: srv_pf_nep_win <srv_pf_nep_win@mediatek.com>
Reviewed-by: sticky.chen <sticky.chen@mediatek.com>
Test: srv_preflight_a001 <srv_preflight_a001@mediatek.com>
Build: srv_neptune_adm <srv_neptune_adm@mediatek.com>
Reviewed-by: boforn.lin <boforn.lin@mediatek.com>
Test: srv_mspautosanity
Coverity-Review-Label: srv_neptune_adm <srv_neptune_adm@mediatek.com>
Build: srv_preflight_a001 <srv_preflight_a001@mediatek.com>
AutoUT-Review-Label: srv_neptune_adm <srv_neptune_adm@mediatek.com>
Test: srv_pf_nep_sanity
CodeQL-Review-Label: srv_pf_nep_win <srv_pf_nep_win@mediatek.com>
Reviewed-by: han.hu <han.hu@mediatek.com>
Signed-off-by: Cyber Knight <cyberknight755@gmail.com>
2025-12-08 00:52:03 +00:00
Jake Chung
9225800863 WFSYS: wlanRemove after wlanOnAtReset fail
[Description]
wlanRemove after wlanOnAtReset fail

[Test]
build pass, L0/L0.5 UT pass

Change-Id: Iae88ff10a70d150b5396782be67ab231d5fab987
Mot-CRs-Fixed: (CR)
CR-Id: ALPS09855577
Feature: Wi-Fi Driver CONNAC
Signed-off-by: Jake Chung <jake.chung@mediatek.com>
Signed-off-by: Yue Sun <sunyue5@motorola.com>
Reviewed-on: https://gerrit.mot.com/3329653
SME-Granted: SME Approvals Granted
SLTApproved: Slta Waiver
Tested-by: Jira Key
Reviewed-by: Zhilu Yin <yinzl1@motorola.com>
Submit-Approved: Jira Key
Signed-off-by: Cyber Knight <cyberknight755@gmail.com>
2025-12-08 00:52:02 +00:00
claxten10
45ca229eae drivers: misc: Completely rework AW8622 haptics driver
Signed-off-by: claxten10 <claxten10@gmail.com>
2025-12-08 00:13:55 +00:00
Woomymy
dc81465869 arch: arm64: configs: Disable PD debugging info
Change-Id: I6e4aea368af5cf3c9e60c2a1440810837d4068e4
Signed-off-by: Woomymy <woomy@woomy.be>
2025-12-08 00:13:55 +00:00
Woomymy
764826f6a9 arch: arm64: configs: Use reduced debug info
Change-Id: I6a329a773c0a1bee451217cc5b6a03de0e8e2687
Signed-off-by: Woomymy <woomy@woomy.be>
2025-12-08 00:13:55 +00:00
claxten10
357e5e049a arch: arm64: configs: Disable AEE features
Signed-off-by: claxten10 <claxten10@gmail.com>
2025-12-08 00:13:54 +00:00
bengris32
c5f2753931 arch: arm64: Build connectivity modules inline
Change-Id: I08f90939ca3d5c4e0c6c65a60c31c2cda4f9915d
Signed-off-by: bengris32 <bengris32@protonmail.ch>
2025-12-08 00:13:54 +00:00
Eric Biggers
2d24c9882a arch: arm64: configs: enable BLAKE2b support
Bug: 178411248
[adelva: patched around missing XCBC option on 4.19]
Change-Id: Iec497954d29adcf7193da9ca4b27d61eac7615d9
Signed-off-by: Eric Biggers <ebiggers@google.com>
2025-12-08 00:13:54 +00:00
Rachel Tseng
078cb27882 [ALPS08338404] Dont use PMKID if auth type is SAE
If the STA puts a PMKID in the Assoc Req when the auth type is SAE,
some APs will reject it with "invalid PMKID", even if the PMKID is
correct. Therefore, don't use a PMKID if the auth type is SAE.

MTK-Commit-Id: 5f44cb7b7a5067da4bf426c33500abcb7770d729

Change-Id: Ie3c3aea6801a9f1b8ef513a544f48bf3364b835a
CR-Id: ALPS08338404
Feature: Wi-Fi Driver CONNAC
Signed-off-by: junjiang.yu <ot_junjiang.yu@mediatek.com>
Reviewed-on: https://gerrit.mediatek.inc/c/neptune/wlan_driver/gen4m/+/7730698
Test: srv_pf_nep_sanity
Coverity-Review-Label: srv_neptune_adm <srv_neptune_adm@mediatek.com>
Test: srv_preflight_a001 <srv_preflight_a001@mediatek.com>
Reviewed-by: sticky.chen <sticky.chen@mediatek.com>
Reviewed-on: https://gerrit.mediatek.inc/c/neptune/wlan_driver/gen4m/+/7981197
Build: srv_preflight_a001 <srv_preflight_a001@mediatek.com>
AutoUT-Review-Label: srv_neptune_adm <srv_neptune_adm@mediatek.com>
Reviewed-by: rachel.tseng <rachel.tseng@mediatek.com>
Commit-Check: srv_check_service <srv_check_service@mediatek.com>
Reviewed-by: ben.lai <ben.lai@mediatek.com>
2025-12-08 00:13:54 +00:00
sunyue
7dd0e4b2dc wlan: Fix a NULL pointer issue in wlan host driver
In a rare case, a PMF wifi router sends an sa query request to us before
the connection is established (aisUpdateBssInfoForJOIN), which causes a
kernel panic because prStaRecOfAP has not been set.

Solution:
Directly return from the function rsnSaQueryRequest without updating the
MDSU_INFO_T or sending an sa query response.

Change-Id: Ieb643f13dd1203e382881517af6cc7fb8e95c354
Reviewed-on: https://gerrit.mot.com/2060858
SME-Granted: SME Approvals Granted
SLTApproved: Slta Waiver
Tested-by: Jira Key
Reviewed-by: Yue Sun <sunyue5@lenovo.com>
Reviewed-by: Bin Liu <liubin7@motorola.com>
Submit-Approved: Jira Key
2025-12-08 00:13:54 +00:00
claxten10
913616b380 misc: mtk/connectivity: Build gps driver
Signed-off-by: claxten10 <claxten10@gmail.com>
2025-12-08 00:13:54 +00:00
bengris32
4341b860ee conninfra: Suppress spammy verbose logging
Change-Id: I4dcf1ecea571a48f023a992f8a9799df219b75b8
Signed-off-by: bengris32 <bengris32@protonmail.ch>
2025-12-08 00:13:54 +00:00
bengris32
569320a6a6 connectivity: Disable WLAN boost if !CONFIG_MTK_CPU_CTRL
Change-Id: I4bf1df6b600e2a3c3495e1a149a993cf029c57fa
Signed-off-by: bengris32 <bengris32@protonmail.ch>
2025-12-08 00:13:53 +00:00
bengris32
5122e005a0 drivers: connectivity: {connfem,gps}: Build modules into kernel
Change-Id: Ib72fa5910b9e43efa266cd0bd0abaabb223a3b1e
Signed-off-by: bengris32 <bengris32@protonmail.ch>
2025-12-08 00:13:53 +00:00
bengris32
1df79b59a0 drivers: connectivity: gen4m: Silence more debug logging
Change-Id: Ic176c9b20b909b233bf07eb613fb04f842fe2e38
Signed-off-by: bengris32 <bengris32@protonmail.ch>
2025-12-08 00:13:53 +00:00
Vaisakh Murali
b968b5b4b5 drivers: misc/mtk: connectivity-wlan: Queue delayed work on power efficient wq
Power efficient workqueues will help reduce the overall power overhead
incurred by this driver on certain workqueues.

Signed-off-by: Vaisakh Murali <mvaisakh@statixos.com>
Signed-off-by: LinkBoi00 <linkdevel@protonmail.com>
2025-12-08 00:13:53 +00:00
Vaisakh Murali
0a3a3c150b drivers: connectivity: gen4m: Change logging levels
* The logs in this driver horrendously hog CPU power,
  affecting performance
* Show only errors.

Change-Id: I8259933219afb13037606fbb51f09cab505f5bbc
Signed-off-by: Vaisakh Murali <mvaisakh@statixos.com>
2025-12-08 00:13:53 +00:00
bengris32
7d7eca7d8e connectivity: Clean-up Makefile
* Clean up Makefile for inline compiling of connectivity modules.
2025-12-08 00:13:53 +00:00
bengris32
8c609c32a2 drivers: connectivity: bt: Don't define module init/exit if built-in to kernel
* The way MediaTek intended the connectivity modules to work when
  built-in to the kernel is to have conninfra initialise all of the
  connectivity modules by itself (Wi-Fi, BT, GPS, FM Radio, etc).

* This initialisation process would be done when conninfra was fully
  initialised and ready to communicate with the other drivers. However
  MediaTek forgot to guard the module_init and module_exit definitions
  with the macro used to compile the driver for built-in usage. This
  causes a race condition where the Bluetooth driver is trying to initialise
  before conninfra is ready, leading to a kernel panic early on due to a
  null pointer dereference.

Change-Id: I77f831b2aed913865b5d77f117fdab9038e956b2
Signed-off-by: bengris32 <bengris32@protonmail.ch>
2025-12-08 00:13:52 +00:00
bengris32
e9c69fdc48 drivers: connectivity: gen4m: Fix built-in config detection
Change-Id: I5e3eaf3d405cf90af1fb98f7a0281bd7a7dc298d
Signed-off-by: bengris32 <bengris32@protonmail.ch>
2025-12-08 00:13:52 +00:00
Erfan Abdi
cfc37e47d1 mediatek: Port connectivity modules for in-kernel building 2025-12-08 00:13:52 +00:00
bengris32
aece3ea5d8 connectivity: Fix function prototype warnings
Change-Id: Ie9f0bb34161a0fbda3202dce0deb1e94215a38c5
Signed-off-by: bengris32 <bengris32@protonmail.ch>
2025-12-08 00:13:52 +00:00
claxten10
e1e5fda7f2 misc: mtk/connectivity: Build BT driver inline
Signed-off-by: claxten10 <claxten10@gmail.com>
2025-12-08 00:13:52 +00:00
Vaisakh Murali
f8482f1818 drivers: connectivity: Add an option to build wlan driver in kernel
* This is the way MediaTek prefers it, so be it

Signed-off-by: Vaisakh Murali <mvaisakh@statixos.com>
Change-Id: Ie02d6e887a0febad4515162d126abea2014eecf7
2025-12-08 00:13:52 +00:00
bengris32
172ccb900d gen4m: Add NL80211_WPA_VERSION_3 enumeration
Signed-off-by: bengris32 <bengris32@protonmail.ch>
Change-Id: I9fe0aa9d6420380b727532ae054d75097bacd07f
2025-12-08 00:13:52 +00:00
Woomymy
2f9d75c526 drivers: connectivity: common: Force-disable WMT debugging
Signed-off-by: Woomymy <woomy@woomy.be>
Change-Id: Ia4f6b799fc7858e77e05f50c285f6c0151d5c3f5
2025-12-08 00:13:51 +00:00
Woomymy
088e04c09b drivers: connectivity: bt-mt66xx: Disable debugging logs on all variants
Change-Id: I296bf4fdac66bc27ffbbe1dd04b3b6d4e4a7ff92
Signed-off-by: Woomymy <woomy@woomy.be>
2025-12-08 00:13:51 +00:00
zainarbani
f486fe0c02 connectivity: gen4m: Silence logspam
- Same behaviour on stock, shut it up.

Signed-off-by: zainarbani <zaintsyariev@gmail.com>
2025-12-08 00:13:51 +00:00
claxten10
fed8fcd6b8 misc: mtk/connectivity: Remove redefinitions
Signed-off-by: claxten10 <claxten10@gmail.com>
2025-12-08 00:13:51 +00:00
bengris32
7ae4ec216a drivers: connectivity: gen4m: Use PM notifier to control WLAN suspend
Change-Id: Iaa8df18c147b9dc6c940e90de6d98ee2f1cb7f51
Signed-off-by: bengris32 <bengris32@protonmail.ch>
2025-12-08 00:13:51 +00:00
Erfan Abdi
08378fd86f drivers: misc/mtk: connectivity: wlan: Fix wifi random disconnections
Change-Id: Id00b452996363a14d127f6f720bf0a00a8c167ee
Signed-off-by: LinkBoi00 <linkdevel@protonmail.com>
2025-12-08 00:13:51 +00:00
bengris32
fd5c84fef6 drivers: connectivity: gen4m: Disable WLAN wakelocks
Change-Id: Ia30adf5adbb2b1b2de001b28a05cfef6186d25d2
Signed-off-by: bengris32 <bengris32@protonmail.ch>
2025-12-08 00:13:51 +00:00
Erfan Abdi
8a434fe5a6 connectivity: Import from BSP modules 2025-12-08 00:13:50 +00:00
Georg Veichtlbauer
0e3f94642a arch: arm64: configs: Enable memory stats
This is actually read by BatteryStats in Android

Change-Id: Ie5717ebf33a2cab5a4f6ab1846b291931477dd95
2025-12-08 00:13:50 +00:00
Hemant Kumar
96c1338bdc arch: arm64: configs: Enable NCM function driver
Enables configfs supported NCM function driver

Change-Id: Ic23796c5a1388c41d533ca0f4fad04d01fe9e965
Signed-off-by: Hemant Kumar <hemantk@codeaurora.org>
2025-12-08 00:13:50 +00:00
Dan Vacura
74828ab332 ANDROID: defconfig: enable CONFIG_USB_CONFIGFS_F_UVC
Enable the UVC function driver to allow USB gadgets
to connect as a standard video device to a host.
2025-12-08 00:13:49 +00:00
geeny
6098341e18 arch: arm64: configs: Enable WireGuard support
Change-Id: I143bab359f49ae4f7e1b560e39a68ddf56fc0400
2025-12-08 00:13:49 +00:00
Woomymy
d31812bcce arch: arm64: configs: Disable SLUB debugging completely
Change-Id: I7a5977c3fb97a546f3e402bedad1f77ff49ece3e
Signed-off-by: Woomymy <woomy@woomy.be>
2025-12-08 00:13:49 +00:00
Woomymy
9db09ed1cf staging: mtk_ion: Silence IONMSG logspam
Change-Id: I7d932a56a6d1fb2eca6a76ed966566b000ae24b8
Signed-off-by: Woomymy <woomy@woomy.be>
2025-12-08 00:13:49 +00:00
Woomymy
c91c616e3c Revert "[ALPS05269737] USB: Enhance RNDIS Performance"
Reason for revert: Mediatek "optimized" RNDIS so well that they
literally broke NCM

This reverts commit 3f2cec825b.

Change-Id: Idf19e3761a9ce31f9a38c357ae758c87afdc0d78
Signed-off-by: Woomymy <woomy@woomy.be>
2025-12-08 00:13:49 +00:00
Woomymy
f144c308f3 Revert "[ALPS05333045] cert: fix 10466153"
This reverts commit 8145844a13.
2025-12-08 00:13:49 +00:00
Woomymy
f8c51ef721 Revert "[ALPS05130667] usb: fix flag logic error"
This reverts commit 639af33ffc.
2025-12-08 00:13:49 +00:00
rogercl.yang
cc3c01ac80 ANDROID: adding __nocfi to cpuidle_enter_state
Background:
  When a CPU is going into the idle state, it informs RCU that it
is entering idle through rcu_idle_enter(),
and RCU will then ignore read-side critical sections on this CPU.
However, there is a CFI check mechanism inside the idle flow that
calls rcu_read_lock(), so "rcu_read_lock() used illegally while idle"
will be triggered in rcu_read_lock() because rcu_idle_enter()
was already called before.

  Besides, the pointer returned by rcu_dereference() might be invalid
because RCU read-side critical sections are ignored on this
going-idle CPU; this might cause problems like
accessing the wrong data/address, or a kernel exception...

Based on the above description:
  We add __nocfi to cpuidle_enter_state to avoid
"rcu_read_lock() used illegally while idle!"
and to avoid using an invalid pointer from rcu_dereference()
in this situation.

Bug: 169017431
Change-Id: I8bbe25704e18cfde351a8f4277dd4b44b07421f5
Signed-off-by: rogercl.yang <rogercl.yang@mediatek.com>
Signed-off-by: Chun-Hung Wu <chun-hung.wu@mediatek.com>
2025-12-08 00:13:48 +00:00
Sami Tolvanen
3477e31ecf ANDROID: arm64: add __va_function
With CFI, the compiler replaces function references with pointers
to the CFI jump table. This breaks passing these addresses to
code running at EL2, where the jump tables are not valid. Add a
__va_function macro similarly to the earlier __pa_function to take
address of the actual function in inline assembly and use that in
kvm_ksym_ref instead.

Bug: 163385976
Change-Id: I097b99409995512c00786300e7d18fe42c720a1b
(cherry picked from commit 2f4d6c9fd77c88ad0500aad4bf1f64aaf2654c49)
Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
2025-12-08 00:13:48 +00:00
bengris32
297ad09d0a mali_valhall: Remove MediaTek memtrack support
* We'll be using gs101 memtrack from now on.

Change-Id: I2d91e0d57e59549e3f5bf915f428bc9c14136478
Signed-off-by: bengris32 <bengris32@protonmail.ch>
2025-12-08 00:13:48 +00:00
Ankit Goyal
918b305e35 mali_kbase: platform: Add per-process and global sysfs nodes for GPU mem usage
Bug: 191966412
Signed-off-by: Ankit Goyal <layog@google.com>
Change-Id: Id47feadaf9da7ef8e22494ab64e6263d7f87213c
2025-12-08 00:13:48 +00:00
Ankit Goyal
f8c4c26f2a mali_kbase: platform: Add per-process and global accounting for dma-buf pages
This adds dma_buf_pages alongside total_gpu_pages to track GPU
addressable dmabuf pages for each process and for complete device.

Bug: 191966412
Signed-off-by: Ankit Goyal <layog@google.com>
Change-Id: I29da69e469395d30e784ea9c2ffddcf6fab688fd
2025-12-08 00:13:48 +00:00
Kimberly Brown
37494e2bd1 kobject: Add support for default attribute groups to kobj_type
kobj_type currently uses a list of individual attributes to store
default attributes. Attribute groups are more flexible than a list of
attributes because groups provide support for attribute visibility. So,
add support for default attribute groups to kobj_type.

In future patches, the existing uses of kobj_type’s attribute list will
be converted to attribute groups. When that is complete, kobj_type’s
attribute list, “default_attrs”, will be removed.

Signed-off-by: Kimberly Brown <kimbrownkd@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Change-Id: Id6e67b4b7311ee0ced3653220d4d5e86e3f2ede0
2025-12-08 00:13:48 +00:00
Vaisakh Murali
50abf11357 drivers: mtk-perf: Shut up with the spam
* fpsgo is a proprietary kernel driver (yes, these exist in mtk); this line
  keeps spamming the log, masking what I actually want from the logs

Signed-off-by: Vaisakh Murali <vaisakhmurali@gmail.com>
Signed-off-by: zainarbani <zaintsyariev@gmail.com>
2025-12-08 00:13:48 +00:00
TheMalachite
a4307e6e7f arch: arm64: Remove console args from cmdline 2025-12-08 00:13:47 +00:00
kdrag0n
acb8bfc88b arch: arm64: dts: Suppress verbose output during boot
This should make the kernel initialization faster as it suppresses any
potential serial console output.

Signed-off-by: kdrag0n <dragon@khronodragon.com>
2025-12-08 00:13:47 +00:00
Gagan Malvi
e6cc0753d4 arch: arm64: dts: Remove cmdline argument for SLUB debugging.
Signed-off-by: Gagan Malvi <malvigagan@gmail.com>
2025-12-08 00:13:47 +00:00
Arian
f5da0116bd cpufreq: Ensure the minimal frequency is lower than the maximal frequency
* Libperfmgr increases the minimal frequency to 9999999 in order to boost
  the cpu to the maximal frequency. This usually works because it also
  increases the max frequency to 9999999 at init. However if we decrease
  the maximal frequency afterwards, which mi_thermald does, setting the
  minimal frequency to 9999999 fails because it exceeds the maximal
  frequency.

* We can allow setting a minimal frequency higher than the maximal
  frequency, and a maximal frequency lower than the minimal frequency,
  by clamping the minimal frequency whenever it exceeds the maximal
  frequency.

Change-Id: I25b7ccde714aac14c8fdb9910857c3bd38c0aa05
2025-12-08 00:13:47 +00:00
Sultan Alsawaf
d17ffb511b sched/fair: Compile out NUMA code entirely when NUMA is disabled
Scheduler code is very hot and every little optimization counts. Instead
of constantly checking sched_numa_balancing when NUMA is disabled,
compile it out.

Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>
Change-Id: I7334594fbe835f615a199cfe02ee526135abab06
2025-12-08 00:13:47 +00:00
bengris32
5ec257351b arch: arm64: configs: Disable kernel AAL support
* Not only is AAL broken with our blobs (constantly spamming
  that CONFIG_MTK_AAL_SUPPORT is disabled, even though it isn't),
  this will cause the brightness levels to be forcefully remapped
  to the 0-1024 range.

* Since AAL is broken anyway, just disable it.

Change-Id: Icbb402c435d7af1512d381a0a136d181f064771a
Signed-off-by: bengris32 <bengris32@protonmail.ch>
2025-12-08 00:13:47 +00:00
claxten10
2fd2f31177 misc: mtk/flashlight: Import minimal Xiaomi changes
Signed-off-by: claxten10 <claxten10@gmail.com>
2025-12-08 00:13:47 +00:00
huangsh4
fa95c9f35e misc: mtk/flashlight: optimize for flash light
camera:
optimize for flash light

Change-Id: Ia5c8614944c1554a1bb1771dcbd2d37bc56cfcbf
Signed-off-by: huangsh4 <huangsh4@lenovo.com>
Reviewed-on: https://gerrit.mot.com/1924674
Reviewed-by: Darong Huang <huangdra@motorola.com>
Reviewed-by: Heng Chen <chenheng3@lenovo.com>
Reviewed-by: Shanghui Zhang <zhangsh@motorola.com>
Reviewed-by: Shenhuai Huang <huangsh4@motorola.com>
Reviewed-by: Zhilong Wang <wangzl30@motorola.com>
Reviewed-by: Xu Ji <jixu@motorola.com>
Reviewed-by: Long Cheng <chengl1@motorola.com>
SME-Granted: SME Approvals Granted
SLTApproved: Slta Waiver
Tested-by: Jira Key
Reviewed-by: Zhuoran Xu <xuzr3@motorola.com>
Reviewed-by: Jian Zhang <zhangjo@motorola.com>
Reviewed-by: Zhichao Chen <chenzc2@motorola.com>
Submit-Approved: Jira Key
2025-12-08 00:13:47 +00:00
huangsh4
5aabcd1b1f misc: mtk/flashlight: enable flashlight feature
enable flashlight feature.

Change-Id: I59e570d68d49a48a0bf70ab45f4ecd4d74f4636c
Signed-off-by: huangsh4 <huangsh4@lenovo.com>
Reviewed-on: https://gerrit.mot.com/1906966
SLTApproved: Slta Waiver
SME-Granted: SME Approvals Granted
Submit-Approved: Jira Key
Tested-by: Jira Key
Reviewed-by: Jian Zhang <zhangjo@motorola.com>
Reviewed-by: Zhuoran Xu <xuzr3@motorola.com>
Reviewed-by: Zhilong Wang <wangzl30@motorola.com>
Reviewed-by: Shanghui Zhang <zhangsh@motorola.com>
Reviewed-by: Qiang Guo <guoq8@motorola.com>
Reviewed-by: Long Cheng <chengl1@motorola.com>
Reviewed-by: Zhichao Chen <chenzc2@motorola.com>
2025-12-08 00:13:46 +00:00
claxten10
77ad39bec2 Revert "misc: mtk/flashlight: Import Xiaomi changes"
* Will move to Motorola's newer driver.

This reverts commit d83925de53b5da625bef2c41b95265eed31ccaa9.
2025-12-08 00:13:46 +00:00
6f176e74f6 dts/mt6781: remove duplicate of vdec_gcon
Signed-off-by: Onelots <onelots@onelots.fr>
2025-12-08 00:13:46 +00:00
0918ab9332 dts/mt6781: remove duplicate of venc@17000000
Signed-off-by: Onelots <onelots@onelots.fr>
Co-authored-by: Edrick Sinsuan <evcsinsuan@gmail.com>
2025-12-08 00:13:46 +00:00
a8b425e789 dts/mt6781: uart: disable all useless uart nodes
Signed-off-by: Onelots <onelots@onelots.fr>
2025-12-08 00:13:46 +00:00
Sultan Alsawaf
315ce721fb binder: Stub out debug prints by default
Binder code is very hot, so checking frequently to see if a debug
message should be printed is a waste of cycles. We're not debugging
binder, so just stub out the debug prints to compile them out entirely.

Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>
2025-12-08 00:13:46 +00:00
kdrag0n
762cfaa762 arm64: debug: disable self-hosted debug by default
Signed-off-by: kdrag0n <dragon@khronodragon.com>
Signed-off-by: celtare21 <celtare21@gmail.com>
2025-12-08 00:13:45 +00:00
John Dias
8ae07fff57 binder: set binder_debug_mask=0 to suppress logging
Excessive logging -- not present on angler -- is affecting
performance, contributing to missed audio deadlines and likely other
latency-dependent tasks.
Bug: 30375418

Change-Id: I88b9c7fa4540ad46e564f44a0e589b5215e8487d
2025-12-08 00:13:45 +00:00
Pzqqt
b5af57206d drivers: scsi: Reduce logspam 2025-12-08 00:13:45 +00:00
claxten10
c0863fa583 arch: arm64: configs: Enable SIA81XX driver
* Used in Indonesian models of fleur.

Signed-off-by: claxten10 <claxten10@gmail.com>
2025-12-08 00:13:45 +00:00
85 changed files with 2406 additions and 2543 deletions

View File

@@ -485,7 +485,7 @@ section that the grace period must wait on.
noted by <tt>rcu_node_context_switch()</tt> on the left.
On the other hand, if the CPU takes a scheduler-clock interrupt
while executing in usermode, a quiescent state will be noted by
<tt>rcu_check_callbacks()</tt> on the right.
<tt>rcu_sched_clock_irq()</tt> on the right.
Either way, the passage through a quiescent state will be noted
in a per-CPU variable.
@@ -651,7 +651,7 @@ to end.
These callbacks are identified by <tt>rcu_advance_cbs()</tt>,
which is usually invoked by <tt>__note_gp_changes()</tt>.
As shown in the diagram below, this invocation can be triggered by
the scheduling-clock interrupt (<tt>rcu_check_callbacks()</tt> on
the scheduling-clock interrupt (<tt>rcu_sched_clock_irq()</tt> on
the left) or by idle entry (<tt>rcu_cleanup_after_idle()</tt> on
the right, but only for kernels built with
<tt>CONFIG_RCU_FAST_NO_HZ=y</tt>).

View File

@@ -349,7 +349,7 @@
font-weight="bold"
font-size="192"
id="text202-7-5"
style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_check_callbacks()</text>
style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_sched_clock_irq()</text>
<rect
x="7069.6187"
y="5087.4678"


View File

@@ -3902,7 +3902,7 @@
font-style="normal"
y="-4418.6582"
x="3745.7725"
xml:space="preserve">rcu_check_callbacks()</text>
xml:space="preserve">rcu_sched_clock_irq()</text>
</g>
<g
transform="translate(-850.30204,55463.106)"
@@ -4968,7 +4968,7 @@
font-weight="bold"
font-size="192"
id="text202-7-5-19"
style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_check_callbacks()</text>
style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_sched_clock_irq()</text>
<rect
x="5314.2671"
y="82817.688"


View File

@@ -775,7 +775,7 @@
font-style="normal"
y="-4418.6582"
x="3745.7725"
xml:space="preserve">rcu_check_callbacks()</text>
xml:space="preserve">rcu_sched_clock_irq()</text>
</g>
<g
transform="translate(399.7744,828.86448)"


View File

@@ -3771,7 +3771,9 @@
see CONFIG_RAS_CEC help text.
rcu_nocbs= [KNL]
The argument is a cpu list, as described above.
The argument is a cpu list, as described above,
except that the string "all" can be used to
specify every CPU on the system.
In kernels built with CONFIG_RCU_NOCB_CPU=y, set
the specified list of CPUs to be no-callback CPUs.

View File

@@ -30,7 +30,6 @@
chosen: chosen {
bootargs = "root=/dev/ram \
vmalloc=400M swiotlb=noforce \
initcall_debug=1 \
firmware_class.path=/vendor/firmware \
page_owner=on quiet loop.max_part=7";
kaslr-seed = <0 0>;

View File

@@ -29,6 +29,7 @@ CONFIG_BUILD_ARM64_APPENDED_DTB_IMAGE_NAMES="mediatek/mt6781"
CONFIG_BUILD_ARM64_DTB_OVERLAY_IMAGE=y
CONFIG_BUILD_ARM64_DTB_OVERLAY_IMAGE_NAMES="mediatek/fleur"
CONFIG_WQ_POWER_EFFICIENT_DEFAULT=y
# CONFIG_PD_DBG_INFO is not set
CONFIG_ENERGY_MODEL=y
CONFIG_CPU_FREQ=y
CONFIG_CPU_FREQ_STAT=y
@@ -41,6 +42,7 @@ CONFIG_MODULE_SRCVERSION_ALL=y
# CONFIG_BLK_DEV_BSG is not set
CONFIG_BLK_INLINE_ENCRYPTION=y
CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK=y
CONFIG_MQ_IOSCHED_KYBER=y
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
CONFIG_ZSMALLOC=y
CONFIG_XFRM_MIGRATE=y
@@ -55,6 +57,7 @@ CONFIG_FW_LOADER_USER_HELPER=y
CONFIG_FW_LOADER_USER_HELPER_FALLBACK=y
# CONFIG_FW_CACHE is not set
CONFIG_ZRAM=y
CONFIG_ZRAM_WRITEBACK=y
CONFIG_BLK_DEV_LOOP_MIN_COUNT=16
CONFIG_ANDROID_DEFAULT_SETTING=y
CONFIG_MTK_ANDROID_DEFAULT_SETTING=y
@@ -67,6 +70,8 @@ CONFIG_MTK_MUSB_DUAL_ROLE=y
CONFIG_MTK_PLATFORM="mt6785"
CONFIG_ARCH_MTK_PROJECT="fleur"
CONFIG_BLK_CGROUP=y
# CONFIG_MEMCG is not set
# CONFIG_MEMCG_SWAP is not set
CONFIG_MTK_BATTERY_OC_POWER_THROTTLING=y
CONFIG_MTK_BATTERY_PERCENTAGE_POWER_THROTTLING=y
CONFIG_MTK_LOW_BATTERY_POWER_THROTTLING=y
@@ -226,6 +231,7 @@ CONFIG_PPP_MULTILINK=y
CONFIG_PPPOE=y
CONFIG_PPP_ASYNC=y
CONFIG_PPP_SYNC_TTY=y
CONFIG_POWERCAP=y
CONFIG_USB_USBNET=y
# CONFIG_KEYBOARD_ATKBD is not set
CONFIG_KEYBOARD_MTK=y
@@ -401,7 +407,7 @@ CONFIG_CRYPTO_TWOFISH=y
CONFIG_CRYPTO_BLAKE2B=y
# CONFIG_CRYPTO_HW is not set
CONFIG_PRINTK_TIME=y
CONFIG_DEBUG_INFO=y
CONFIG_DEBUG_INFO_REDUCED=y
CONFIG_FRAME_WARN=2800
CONFIG_DETECT_HUNG_TASK=y
CONFIG_BOOTPARAM_HUNG_TASK_PANIC=y

View File

@@ -1,258 +1,131 @@
/*
* Copyright (C) 2013 ARM Ltd.
* Copyright (C) 2013 Linaro.
* Copyright (c) 2017 ARM Ltd
* All rights reserved.
*
* This code is based on glibc cortex strings work originally authored by Linaro
* and re-licensed under GPLv2 for the Linux kernel. The original code can
* be found @
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the company may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
* files/head:/src/aarch64/
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
* THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Assumptions:
*
* ARMv8-a, AArch64, unaligned accesses.
*/
/* includes here */
#include <linux/linkage.h>
#include <asm/assembler.h>
/*
* compare memory areas(when two memory areas' offset are different,
* alignment handled by the hardware)
*
* Parameters:
* x0 - const memory area 1 pointer
* x1 - const memory area 2 pointer
* x2 - the maximal compare byte length
* Returns:
* x0 - a compare result, maybe less than, equal to, or greater than ZERO
*/
/* Parameters and result. */
src1 .req x0
src2 .req x1
limit .req x2
result .req x0
#define src1 x0
#define src2 x1
#define limit x2
#define result w0
/* Internal variables. */
data1 .req x3
data1w .req w3
data2 .req x4
data2w .req w4
has_nul .req x5
diff .req x6
endloop .req x7
tmp1 .req x8
tmp2 .req x9
tmp3 .req x10
pos .req x11
limit_wd .req x12
mask .req x13
#define data1 x3
#define data1w w3
#define data2 x4
#define data2w w4
#define tmp1 x5
/* Small inputs of less than 8 bytes are handled separately. This allows the
main code to be sped up using unaligned loads since there are now at least
8 bytes to be compared. If the first 8 bytes are equal, align src1.
This ensures each iteration does at most one unaligned access even if both
src1 and src2 are unaligned, and mutually aligned inputs behave as if
aligned. After the main loop, process the last 8 bytes using unaligned
accesses. */
.p2align 6
WEAK(memcmp)
cbz limit, .Lret0
eor tmp1, src1, src2
tst tmp1, #7
b.ne .Lmisaligned8
ands tmp1, src1, #7
b.ne .Lmutual_align
sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
/*
* The input source addresses are at alignment boundary.
* Directly compare eight bytes each time.
*/
.Lloop_aligned:
ldr data1, [src1], #8
ldr data2, [src2], #8
.Lstart_realigned:
subs limit_wd, limit_wd, #1
eor diff, data1, data2 /* Non-zero if differences found. */
csinv endloop, diff, xzr, cs /* Last Dword or differences. */
cbz endloop, .Lloop_aligned
subs limit, limit, 8
b.lo .Lless8
/* Not reached the limit, must have found a diff. */
tbz limit_wd, #63, .Lnot_limit
/* Limit >= 8, so check first 8 bytes using unaligned loads. */
ldr data1, [src1], 8
ldr data2, [src2], 8
and tmp1, src1, 7
add limit, limit, tmp1
cmp data1, data2
bne .Lreturn
/* Limit % 8 == 0 => the diff is in the last 8 bytes. */
ands limit, limit, #7
b.eq .Lnot_limit
/*
* The remained bytes less than 8. It is needed to extract valid data
* from last eight bytes of the intended memory range.
*/
lsl limit, limit, #3 /* bytes-> bits. */
mov mask, #~0
CPU_BE( lsr mask, mask, limit )
CPU_LE( lsl mask, mask, limit )
bic data1, data1, mask
bic data2, data2, mask
/* Align src1 and adjust src2 with bytes not yet done. */
sub src1, src1, tmp1
sub src2, src2, tmp1
orr diff, diff, mask
b .Lnot_limit
subs limit, limit, 8
b.ls .Llast_bytes
.Lmutual_align:
/*
* Sources are mutually aligned, but are not currently at an
* alignment boundary. Round down the addresses and then mask off
* the bytes that precede the start point.
*/
bic src1, src1, #7
bic src2, src2, #7
ldr data1, [src1], #8
ldr data2, [src2], #8
/*
* We can not add limit with alignment offset(tmp1) here. Since the
* addition probably make the limit overflown.
*/
sub limit_wd, limit, #1/*limit != 0, so no underflow.*/
and tmp3, limit_wd, #7
lsr limit_wd, limit_wd, #3
add tmp3, tmp3, tmp1
add limit_wd, limit_wd, tmp3, lsr #3
add limit, limit, tmp1/* Adjust the limit for the extra. */
/* Loop performing 8 bytes per iteration using aligned src1.
Limit is pre-decremented by 8 and must be larger than zero.
Exit if <= 8 bytes left to do or if the data is not equal. */
.p2align 4
.Lloop8:
ldr data1, [src1], 8
ldr data2, [src2], 8
subs limit, limit, 8
ccmp data1, data2, 0, hi /* NZCV = 0b0000. */
b.eq .Lloop8
lsl tmp1, tmp1, #3/* Bytes beyond alignment -> bits.*/
neg tmp1, tmp1/* Bits to alignment -64. */
mov tmp2, #~0
/*mask off the non-intended bytes before the start address.*/
CPU_BE( lsl tmp2, tmp2, tmp1 )/*Big-endian.Early bytes are at MSB*/
/* Little-endian. Early bytes are at LSB. */
CPU_LE( lsr tmp2, tmp2, tmp1 )
cmp data1, data2
bne .Lreturn
orr data1, data1, tmp2
orr data2, data2, tmp2
b .Lstart_realigned
/* Compare last 1-8 bytes using unaligned access. */
.Llast_bytes:
ldr data1, [src1, limit]
ldr data2, [src2, limit]
/*src1 and src2 have different alignment offset.*/
.Lmisaligned8:
cmp limit, #8
b.lo .Ltiny8proc /*limit < 8: compare byte by byte*/
/* Compare data bytes and set return value to 0, -1 or 1. */
.Lreturn:
#ifndef __AARCH64EB__
rev data1, data1
rev data2, data2
#endif
cmp data1, data2
.Lret_eq:
cset result, ne
cneg result, result, lo
ret
and tmp1, src1, #7
neg tmp1, tmp1
add tmp1, tmp1, #8/*valid length in the first 8 bytes of src1*/
and tmp2, src2, #7
neg tmp2, tmp2
add tmp2, tmp2, #8/*valid length in the first 8 bytes of src2*/
subs tmp3, tmp1, tmp2
csel pos, tmp1, tmp2, hi /*Choose the maximum.*/
sub limit, limit, pos
/*compare the proceeding bytes in the first 8 byte segment.*/
.Ltinycmp:
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
subs pos, pos, #1
ccmp data1w, data2w, #0, ne /* NZCV = 0b0000. */
b.eq .Ltinycmp
cbnz pos, 1f /*diff occurred before the last byte.*/
.p2align 4
/* Compare up to 8 bytes. Limit is [-8..-1]. */
.Lless8:
adds limit, limit, 4
b.lo .Lless4
ldr data1w, [src1], 4
ldr data2w, [src2], 4
cmp data1w, data2w
b.eq .Lstart_align
1:
sub result, data1, data2
ret
.Lstart_align:
lsr limit_wd, limit, #3
cbz limit_wd, .Lremain8
ands xzr, src1, #7
b.eq .Lrecal_offset
/*process more leading bytes to make src1 aligned...*/
add src1, src1, tmp3 /*backwards src1 to alignment boundary*/
add src2, src2, tmp3
sub limit, limit, tmp3
lsr limit_wd, limit, #3
cbz limit_wd, .Lremain8
/*load 8 bytes from aligned SRC1..*/
ldr data1, [src1], #8
ldr data2, [src2], #8
subs limit_wd, limit_wd, #1
eor diff, data1, data2 /*Non-zero if differences found.*/
csinv endloop, diff, xzr, ne
cbnz endloop, .Lunequal_proc
/*How far is the current SRC2 from the alignment boundary...*/
and tmp3, tmp3, #7
.Lrecal_offset:/*src1 is aligned now..*/
neg pos, tmp3
.Lloopcmp_proc:
/*
* Divide the eight bytes into two parts. First,backwards the src2
* to an alignment boundary,load eight bytes and compare from
* the SRC2 alignment boundary. If all 8 bytes are equal,then start
* the second part's comparison. Otherwise finish the comparison.
* This special handle can garantee all the accesses are in the
* thread/task space in avoid to overrange access.
*/
ldr data1, [src1,pos]
ldr data2, [src2,pos]
eor diff, data1, data2 /* Non-zero if differences found. */
cbnz diff, .Lnot_limit
/*The second part process*/
ldr data1, [src1], #8
ldr data2, [src2], #8
eor diff, data1, data2 /* Non-zero if differences found. */
subs limit_wd, limit_wd, #1
csinv endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/
cbz endloop, .Lloopcmp_proc
.Lunequal_proc:
cbz diff, .Lremain8
/* There is difference occurred in the latest comparison. */
.Lnot_limit:
/*
* For little endian,reverse the low significant equal bits into MSB,then
* following CLZ can find how many equal bits exist.
*/
CPU_LE( rev diff, diff )
CPU_LE( rev data1, data1 )
CPU_LE( rev data2, data2 )
/*
* The MS-non-zero bit of DIFF marks either the first bit
* that is different, or the end of the significant data.
* Shifting left now will bring the critical information into the
* top bits.
*/
clz pos, diff
lsl data1, data1, pos
lsl data2, data2, pos
/*
* We need to zero-extend (char is unsigned) the value and then
* perform a signed subtraction.
*/
lsr data1, data1, #56
sub result, data1, data2, lsr #56
ret
.Lremain8:
/* Limit % 8 == 0 =>. all data are equal.*/
ands limit, limit, #7
b.eq .Lret0
.Ltiny8proc:
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
subs limit, limit, #1
ccmp data1w, data2w, #0, ne /* NZCV = 0b0000. */
b.eq .Ltiny8proc
sub result, data1, data2
ret
.Lret0:
mov result, #0
b.ne .Lreturn
sub limit, limit, 4
.Lless4:
adds limit, limit, 4
beq .Lret_eq
.Lbyte_loop:
ldrb data1w, [src1], 1
ldrb data2w, [src2], 1
subs limit, limit, 1
ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
b.eq .Lbyte_loop
sub result, data1w, data2w
ret
ENDPIPROC(memcmp)

View File

@@ -60,6 +60,7 @@ tmp3 .req x9
zeroones .req x10
pos .req x11
.p2align 6
WEAK(strcmp)
eor tmp1, src1, src2
mov zeroones, #REP8_01

View File

@@ -10,8 +10,7 @@
#include "blk.h"
static struct bio *next_bio(struct bio *bio, unsigned int nr_pages,
gfp_t gfp)
struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp)
{
struct bio *new = bio_alloc(gfp, nr_pages);
@@ -61,7 +60,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
WARN_ON_ONCE((req_sects << 9) > UINT_MAX);
bio = next_bio(bio, 0, gfp_mask);
bio = blk_next_bio(bio, 0, gfp_mask);
bio->bi_iter.bi_sector = sector;
bio_set_dev(bio, bdev);
bio_set_op_attrs(bio, op, 0);
@@ -155,7 +154,7 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
max_write_same_sectors = bio_allowed_max_sectors(q);
while (nr_sects) {
bio = next_bio(bio, 1, gfp_mask);
bio = blk_next_bio(bio, 1, gfp_mask);
bio->bi_iter.bi_sector = sector;
bio_set_dev(bio, bdev);
bio->bi_vcnt = 1;
@@ -231,7 +230,7 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
return -EOPNOTSUPP;
while (nr_sects) {
bio = next_bio(bio, 0, gfp_mask);
bio = blk_next_bio(bio, 0, gfp_mask);
bio->bi_iter.bi_sector = sector;
bio_set_dev(bio, bdev);
bio->bi_opf = REQ_OP_WRITE_ZEROES;
@@ -282,8 +281,8 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev,
return -EPERM;
while (nr_sects != 0) {
bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects),
gfp_mask);
bio = blk_next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects),
gfp_mask);
bio->bi_iter.bi_sector = sector;
bio_set_dev(bio, bdev);
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);

View File

@@ -50,12 +50,12 @@ blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
return true;
}
static inline void blk_mq_sched_completed_request(struct request *rq)
static inline void blk_mq_sched_completed_request(struct request *rq, u64 now)
{
struct elevator_queue *e = rq->q->elevator;
if (e && e->type->ops.mq.completed_request)
e->type->ops.mq.completed_request(rq);
e->type->ops.mq.completed_request(rq, now);
}
static inline void blk_mq_sched_started_request(struct request *rq)

View File

@@ -527,6 +527,9 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
blk_stat_add(rq, now);
}
if (rq->internal_tag != -1)
blk_mq_sched_completed_request(rq, now);
blk_account_io_done(rq, now);
if (rq->end_io) {
@@ -563,8 +566,6 @@ static void __blk_mq_complete_request(struct request *rq)
if (!blk_mq_mark_complete(rq))
return;
if (rq->internal_tag != -1)
blk_mq_sched_completed_request(rq);
if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
rq->q->softirq_done_fn(rq);

View File

@@ -300,6 +300,11 @@ static ssize_t queue_zoned_show(struct request_queue *q, char *page)
}
}
static ssize_t queue_nr_zones_show(struct request_queue *q, char *page)
{
return queue_var_show(blk_queue_nr_zones(q), page);
}
static ssize_t queue_nomerges_show(struct request_queue *q, char *page)
{
return queue_var_show((blk_queue_nomerges(q) << 1) |
@@ -637,6 +642,11 @@ static struct queue_sysfs_entry queue_zoned_entry = {
.show = queue_zoned_show,
};
static struct queue_sysfs_entry queue_nr_zones_entry = {
.attr = {.name = "nr_zones", .mode = 0444 },
.show = queue_nr_zones_show,
};
static struct queue_sysfs_entry queue_nomerges_entry = {
.attr = {.name = "nomerges", .mode = 0644 },
.show = queue_nomerges_show,
@@ -727,6 +737,7 @@ static struct attribute *default_attrs[] = {
&queue_write_zeroes_max_entry.attr,
&queue_nonrot_entry.attr,
&queue_zoned_entry.attr,
&queue_nr_zones_entry.attr,
&queue_nomerges_entry.attr,
&queue_rq_affinity_entry.attr,
&queue_iostats_entry.attr,

View File

@@ -13,6 +13,8 @@
#include <linux/rbtree.h>
#include <linux/blkdev.h>
#include "blk.h"
static inline sector_t blk_zone_start(struct request_queue *q,
sector_t sector)
{
@@ -63,6 +65,33 @@ void __blk_req_zone_write_unlock(struct request *rq)
}
EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
static inline unsigned int __blkdev_nr_zones(struct request_queue *q,
sector_t nr_sectors)
{
unsigned long zone_sectors = blk_queue_zone_sectors(q);
return (nr_sectors + zone_sectors - 1) >> ilog2(zone_sectors);
}
/**
* blkdev_nr_zones - Get number of zones
* @bdev: Target block device
*
* Description:
* Return the total number of zones of a zoned block device.
* For a regular block device, the number of zones is always 0.
*/
unsigned int blkdev_nr_zones(struct block_device *bdev)
{
struct request_queue *q = bdev_get_queue(bdev);
if (!blk_queue_is_zoned(q))
return 0;
return __blkdev_nr_zones(q, bdev->bd_part->nr_sects);
}
EXPORT_SYMBOL_GPL(blkdev_nr_zones);
/*
* Check that a zone report belongs to the partition.
* If yes, fix its start sector and write pointer, copy it in the
@@ -253,13 +282,13 @@ int blkdev_reset_zones(struct block_device *bdev,
struct bio *bio;
int ret;
if (!q)
return -ENXIO;
if (!blk_queue_is_zoned(q))
return -EOPNOTSUPP;
if (end_sector > bdev->bd_part->nr_sects)
if (bdev_read_only(bdev))
return -EPERM;
if (!nr_sectors || end_sector > bdev->bd_part->nr_sects)
/* Out of range */
return -EINVAL;
@@ -272,19 +301,14 @@ int blkdev_reset_zones(struct block_device *bdev,
end_sector != bdev->bd_part->nr_sects)
return -EINVAL;
blk_start_plug(&plug);
while (sector < end_sector) {
bio = bio_alloc(gfp_mask, 0);
bio = blk_next_bio(bio, 0, gfp_mask);
bio->bi_iter.bi_sector = sector;
bio_set_dev(bio, bdev);
bio_set_op_attrs(bio, REQ_OP_ZONE_RESET, 0);
ret = submit_bio_wait(bio);
bio_put(bio);
if (ret)
return ret;
sector += zone_sectors;
/* This may take a while, so be nice to others */
@@ -292,7 +316,12 @@ int blkdev_reset_zones(struct block_device *bdev,
}
return 0;
ret = submit_bio_wait(bio);
bio_put(bio);
blk_finish_plug(&plug);
return ret;
}
EXPORT_SYMBOL_GPL(blkdev_reset_zones);
@@ -325,8 +354,7 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
if (!rep.nr_zones)
return -EINVAL;
if (rep.nr_zones > INT_MAX / sizeof(struct blk_zone))
return -ERANGE;
rep.nr_zones = min(blkdev_nr_zones(bdev), rep.nr_zones);
zones = kvmalloc_array(rep.nr_zones, sizeof(struct blk_zone),
GFP_KERNEL | __GFP_ZERO);

View File

@@ -438,4 +438,6 @@ extern int blk_iolatency_init(struct request_queue *q);
static inline int blk_iolatency_init(struct request_queue *q) { return 0; }
#endif
struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp);
#endif /* BLK_INTERNAL_H */

View File

@@ -537,6 +537,10 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
return blkdev_report_zones_ioctl(bdev, mode, cmd, arg);
case BLKRESETZONE:
return blkdev_reset_zones_ioctl(bdev, mode, cmd, arg);
case BLKGETZONESZ:
return put_uint(arg, bdev_zone_sectors(bdev));
case BLKGETNRZONES:
return put_uint(arg, blkdev_nr_zones(bdev));
case HDIO_GETGEO:
return blkdev_getgeo(bdev, argp);
case BLKRAGET:

View File

@@ -29,19 +29,30 @@
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-stat.h"
/* Scheduling domains. */
#define CREATE_TRACE_POINTS
#include <trace/events/kyber.h>
/*
* Scheduling domains: the device is divided into multiple domains based on the
* request type.
*/
enum {
KYBER_READ,
KYBER_SYNC_WRITE,
KYBER_OTHER, /* Async writes, discard, etc. */
KYBER_WRITE,
KYBER_DISCARD,
KYBER_OTHER,
KYBER_NUM_DOMAINS,
};
enum {
KYBER_MIN_DEPTH = 256,
static const char *kyber_domain_names[] = {
[KYBER_READ] = "READ",
[KYBER_WRITE] = "WRITE",
[KYBER_DISCARD] = "DISCARD",
[KYBER_OTHER] = "OTHER",
};
enum {
/*
* In order to prevent starvation of synchronous requests by a flood of
* asynchronous requests, we reserve 25% of requests for synchronous
@@ -51,25 +62,87 @@ enum {
};
/*
* Initial device-wide depths for each scheduling domain.
* Maximum device-wide depth for each scheduling domain.
*
* Even for fast devices with lots of tags like NVMe, you can saturate
* the device with only a fraction of the maximum possible queue depth.
* So, we cap these to a reasonable value.
* Even for fast devices with lots of tags like NVMe, you can saturate the
* device with only a fraction of the maximum possible queue depth. So, we cap
* these to a reasonable value.
*/
static const unsigned int kyber_depth[] = {
[KYBER_READ] = 256,
[KYBER_SYNC_WRITE] = 128,
[KYBER_OTHER] = 64,
[KYBER_WRITE] = 128,
[KYBER_DISCARD] = 64,
[KYBER_OTHER] = 16,
};
/*
* Scheduling domain batch sizes. We favor reads.
* Default latency targets for each scheduling domain.
*/
static const u64 kyber_latency_targets[] = {
[KYBER_READ] = 2ULL * NSEC_PER_MSEC,
[KYBER_WRITE] = 10ULL * NSEC_PER_MSEC,
[KYBER_DISCARD] = 5ULL * NSEC_PER_SEC,
};
/*
* Batch size (number of requests we'll dispatch in a row) for each scheduling
* domain.
*/
static const unsigned int kyber_batch_size[] = {
[KYBER_READ] = 16,
[KYBER_SYNC_WRITE] = 8,
[KYBER_OTHER] = 8,
[KYBER_WRITE] = 8,
[KYBER_DISCARD] = 1,
[KYBER_OTHER] = 1,
};
/*
* Requests latencies are recorded in a histogram with buckets defined relative
* to the target latency:
*
* <= 1/4 * target latency
* <= 1/2 * target latency
* <= 3/4 * target latency
* <= target latency
* <= 1 1/4 * target latency
* <= 1 1/2 * target latency
* <= 1 3/4 * target latency
* > 1 3/4 * target latency
*/
enum {
/*
* The width of the latency histogram buckets is
* 1 / (1 << KYBER_LATENCY_SHIFT) * target latency.
*/
KYBER_LATENCY_SHIFT = 2,
/*
* The first (1 << KYBER_LATENCY_SHIFT) buckets are <= target latency,
* thus, "good".
*/
KYBER_GOOD_BUCKETS = 1 << KYBER_LATENCY_SHIFT,
/* There are also (1 << KYBER_LATENCY_SHIFT) "bad" buckets. */
KYBER_LATENCY_BUCKETS = 2 << KYBER_LATENCY_SHIFT,
};
/*
* We measure both the total latency and the I/O latency (i.e., latency after
* submitting to the device).
*/
enum {
KYBER_TOTAL_LATENCY,
KYBER_IO_LATENCY,
};
static const char *kyber_latency_type_names[] = {
[KYBER_TOTAL_LATENCY] = "total",
[KYBER_IO_LATENCY] = "I/O",
};
/*
* Per-cpu latency histograms: total latency and I/O latency for each scheduling
* domain except for KYBER_OTHER.
*/
struct kyber_cpu_latency {
atomic_t buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];
};
/*
@@ -88,12 +161,9 @@ struct kyber_ctx_queue {
struct kyber_queue_data {
struct request_queue *q;
struct blk_stat_callback *cb;
/*
* The device is divided into multiple scheduling domains based on the
* request type. Each domain has a fixed number of in-flight requests of
* that type device-wide, limited by these tokens.
* Each scheduling domain has a limited number of in-flight requests
* device-wide, limited by these tokens.
*/
struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
@@ -103,8 +173,19 @@ struct kyber_queue_data {
*/
unsigned int async_depth;
struct kyber_cpu_latency __percpu *cpu_latency;
/* Timer for stats aggregation and adjusting domain tokens. */
struct timer_list timer;
unsigned int latency_buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];
unsigned long latency_timeout[KYBER_OTHER];
int domain_p99[KYBER_OTHER];
/* Target latencies in nanoseconds. */
u64 read_lat_nsec, write_lat_nsec;
u64 latency_targets[KYBER_OTHER];
};
struct kyber_hctx_data {
@@ -124,233 +205,219 @@ static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
static unsigned int kyber_sched_domain(unsigned int op)
{
if ((op & REQ_OP_MASK) == REQ_OP_READ)
switch (op & REQ_OP_MASK) {
case REQ_OP_READ:
return KYBER_READ;
else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
return KYBER_SYNC_WRITE;
else
case REQ_OP_WRITE:
return KYBER_WRITE;
case REQ_OP_DISCARD:
return KYBER_DISCARD;
default:
return KYBER_OTHER;
}
}
enum {
NONE = 0,
GOOD = 1,
GREAT = 2,
BAD = -1,
AWFUL = -2,
};
#define IS_GOOD(status) ((status) > 0)
#define IS_BAD(status) ((status) < 0)
static int kyber_lat_status(struct blk_stat_callback *cb,
unsigned int sched_domain, u64 target)
static void flush_latency_buckets(struct kyber_queue_data *kqd,
struct kyber_cpu_latency *cpu_latency,
unsigned int sched_domain, unsigned int type)
{
u64 latency;
unsigned int *buckets = kqd->latency_buckets[sched_domain][type];
atomic_t *cpu_buckets = cpu_latency->buckets[sched_domain][type];
unsigned int bucket;
if (!cb->stat[sched_domain].nr_samples)
return NONE;
latency = cb->stat[sched_domain].mean;
if (latency >= 2 * target)
return AWFUL;
else if (latency > target)
return BAD;
else if (latency <= target / 2)
return GREAT;
else /* (latency <= target) */
return GOOD;
for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++)
buckets[bucket] += atomic_xchg(&cpu_buckets[bucket], 0);
}
/*
* Adjust the read or synchronous write depth given the status of reads and
* writes. The goal is that the latencies of the two domains are fair (i.e., if
* one is good, then the other is good).
* Calculate the histogram bucket with the given percentile rank, or -1 if there
* aren't enough samples yet.
*/
static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd,
unsigned int sched_domain, int this_status,
int other_status)
static int calculate_percentile(struct kyber_queue_data *kqd,
unsigned int sched_domain, unsigned int type,
unsigned int percentile)
{
unsigned int orig_depth, depth;
unsigned int *buckets = kqd->latency_buckets[sched_domain][type];
unsigned int bucket, samples = 0, percentile_samples;
for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++)
samples += buckets[bucket];
if (!samples)
return -1;
/*
* If this domain had no samples, or reads and writes are both good or
* both bad, don't adjust the depth.
* We do the calculation once we have 500 samples or one second passes
* since the first sample was recorded, whichever comes first.
*/
if (this_status == NONE ||
(IS_GOOD(this_status) && IS_GOOD(other_status)) ||
(IS_BAD(this_status) && IS_BAD(other_status)))
return;
orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth;
if (other_status == NONE) {
depth++;
} else {
switch (this_status) {
case GOOD:
if (other_status == AWFUL)
depth -= max(depth / 4, 1U);
else
depth -= max(depth / 8, 1U);
break;
case GREAT:
if (other_status == AWFUL)
depth /= 2;
else
depth -= max(depth / 4, 1U);
break;
case BAD:
depth++;
break;
case AWFUL:
if (other_status == GREAT)
depth += 2;
else
depth++;
break;
}
if (!kqd->latency_timeout[sched_domain])
kqd->latency_timeout[sched_domain] = max(jiffies + HZ, 1UL);
if (samples < 500 &&
time_is_after_jiffies(kqd->latency_timeout[sched_domain])) {
return -1;
}
kqd->latency_timeout[sched_domain] = 0;
percentile_samples = DIV_ROUND_UP(samples * percentile, 100);
for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS - 1; bucket++) {
if (buckets[bucket] >= percentile_samples)
break;
percentile_samples -= buckets[bucket];
}
memset(buckets, 0, sizeof(kqd->latency_buckets[sched_domain][type]));
trace_kyber_latency(kqd->q, kyber_domain_names[sched_domain],
kyber_latency_type_names[type], percentile,
bucket + 1, 1 << KYBER_LATENCY_SHIFT, samples);
return bucket;
}
static void kyber_resize_domain(struct kyber_queue_data *kqd,
unsigned int sched_domain, unsigned int depth)
{
depth = clamp(depth, 1U, kyber_depth[sched_domain]);
if (depth != orig_depth)
if (depth != kqd->domain_tokens[sched_domain].sb.depth) {
sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
trace_kyber_adjust(kqd->q, kyber_domain_names[sched_domain],
depth);
}
}
/*
* Adjust the depth of other requests given the status of reads and synchronous
* writes. As long as either domain is doing fine, we don't throttle, but if
* both domains are doing badly, we throttle heavily.
*/
static void kyber_adjust_other_depth(struct kyber_queue_data *kqd,
int read_status, int write_status,
bool have_samples)
static void kyber_timer_fn(struct timer_list *t)
{
unsigned int orig_depth, depth;
int status;
struct kyber_queue_data *kqd = from_timer(kqd, t, timer);
unsigned int sched_domain;
int cpu;
bool bad = false;
orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth;
/* Sum all of the per-cpu latency histograms. */
for_each_online_cpu(cpu) {
struct kyber_cpu_latency *cpu_latency;
if (read_status == NONE && write_status == NONE) {
depth += 2;
} else if (have_samples) {
if (read_status == NONE)
status = write_status;
else if (write_status == NONE)
status = read_status;
else
status = max(read_status, write_status);
switch (status) {
case GREAT:
depth += 2;
break;
case GOOD:
depth++;
break;
case BAD:
depth -= max(depth / 4, 1U);
break;
case AWFUL:
depth /= 2;
break;
cpu_latency = per_cpu_ptr(kqd->cpu_latency, cpu);
for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
flush_latency_buckets(kqd, cpu_latency, sched_domain,
KYBER_TOTAL_LATENCY);
flush_latency_buckets(kqd, cpu_latency, sched_domain,
KYBER_IO_LATENCY);
}
}
depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]);
if (depth != orig_depth)
sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth);
}
/*
* Check if any domains have a high I/O latency, which might indicate
* congestion in the device. Note that we use the p90; we don't want to
* be too sensitive to outliers here.
*/
for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
int p90;
/*
* Apply heuristics for limiting queue depths based on gathered latency
* statistics.
*/
static void kyber_stat_timer_fn(struct blk_stat_callback *cb)
{
struct kyber_queue_data *kqd = cb->data;
int read_status, write_status;
read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec);
write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE, kqd->write_lat_nsec);
kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status);
kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status);
kyber_adjust_other_depth(kqd, read_status, write_status,
cb->stat[KYBER_OTHER].nr_samples != 0);
p90 = calculate_percentile(kqd, sched_domain, KYBER_IO_LATENCY,
90);
if (p90 >= KYBER_GOOD_BUCKETS)
bad = true;
}
/*
* Continue monitoring latencies if we aren't hitting the targets or
* we're still throttling other requests.
* Adjust the scheduling domain depths. If we determined that there was
* congestion, we throttle all domains with good latencies. Either way,
* we ease up on throttling domains with bad latencies.
*/
if (!blk_stat_is_active(kqd->cb) &&
((IS_BAD(read_status) || IS_BAD(write_status) ||
kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER])))
blk_stat_activate_msecs(kqd->cb, 100);
for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
unsigned int orig_depth, depth;
int p99;
p99 = calculate_percentile(kqd, sched_domain,
KYBER_TOTAL_LATENCY, 99);
/*
* This is kind of subtle: different domains will not
* necessarily have enough samples to calculate the latency
* percentiles during the same window, so we have to remember
* the p99 for the next time we observe congestion; once we do,
* we don't want to throttle again until we get more data, so we
* reset it to -1.
*/
if (bad) {
if (p99 < 0)
p99 = kqd->domain_p99[sched_domain];
kqd->domain_p99[sched_domain] = -1;
} else if (p99 >= 0) {
kqd->domain_p99[sched_domain] = p99;
}
if (p99 < 0)
continue;
/*
* If this domain has bad latency, throttle less. Otherwise,
* throttle more iff we determined that there is congestion.
*
* The new depth is scaled linearly with the p99 latency vs the
* latency target. E.g., if the p99 is 3/4 of the target, then
* we throttle down to 3/4 of the current depth, and if the p99
* is 2x the target, then we double the depth.
*/
if (bad || p99 >= KYBER_GOOD_BUCKETS) {
orig_depth = kqd->domain_tokens[sched_domain].sb.depth;
depth = (orig_depth * (p99 + 1)) >> KYBER_LATENCY_SHIFT;
kyber_resize_domain(kqd, sched_domain, depth);
}
}
}
static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
static unsigned int kyber_sched_tags_shift(struct request_queue *q)
{
/*
* All of the hardware queues have the same depth, so we can just grab
* the shift of the first one.
*/
return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
}
static int kyber_bucket_fn(const struct request *rq)
{
return kyber_sched_domain(rq->cmd_flags);
return q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
}
static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
{
struct kyber_queue_data *kqd;
unsigned int max_tokens;
unsigned int shift;
int ret = -ENOMEM;
int i;
kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
kqd = kzalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
if (!kqd)
goto err;
kqd->q = q;
kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, kyber_bucket_fn,
KYBER_NUM_DOMAINS, kqd);
if (!kqd->cb)
kqd->cpu_latency = alloc_percpu_gfp(struct kyber_cpu_latency,
GFP_KERNEL | __GFP_ZERO);
if (!kqd->cpu_latency)
goto err_kqd;
/*
* The maximum number of tokens for any scheduling domain is at least
* the queue depth of a single hardware queue. If the hardware doesn't
* have many tags, still provide a reasonable number.
*/
max_tokens = max_t(unsigned int, q->tag_set->queue_depth,
KYBER_MIN_DEPTH);
timer_setup(&kqd->timer, kyber_timer_fn, 0);
for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
WARN_ON(!kyber_depth[i]);
WARN_ON(!kyber_batch_size[i]);
ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
max_tokens, -1, false, GFP_KERNEL,
q->node);
kyber_depth[i], -1, false,
GFP_KERNEL, q->node);
if (ret) {
while (--i >= 0)
sbitmap_queue_free(&kqd->domain_tokens[i]);
goto err_cb;
goto err_buckets;
}
sbitmap_queue_resize(&kqd->domain_tokens[i], kyber_depth[i]);
}
shift = kyber_sched_tags_shift(kqd);
kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
for (i = 0; i < KYBER_OTHER; i++) {
kqd->domain_p99[i] = -1;
kqd->latency_targets[i] = kyber_latency_targets[i];
}
kqd->read_lat_nsec = 2000000ULL;
kqd->write_lat_nsec = 10000000ULL;
shift = kyber_sched_tags_shift(q);
kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
return kqd;
err_cb:
blk_stat_free_callback(kqd->cb);
err_buckets:
free_percpu(kqd->cpu_latency);
err_kqd:
kfree(kqd);
err:
@@ -372,25 +439,24 @@ static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
return PTR_ERR(kqd);
}
blk_stat_enable_accounting(q);
eq->elevator_data = kqd;
q->elevator = eq;
blk_stat_add_callback(q, kqd->cb);
return 0;
}
static void kyber_exit_sched(struct elevator_queue *e)
{
struct kyber_queue_data *kqd = e->elevator_data;
struct request_queue *q = kqd->q;
int i;
blk_stat_remove_callback(q, kqd->cb);
del_timer_sync(&kqd->timer);
for (i = 0; i < KYBER_NUM_DOMAINS; i++)
sbitmap_queue_free(&kqd->domain_tokens[i]);
blk_stat_free_callback(kqd->cb);
free_percpu(kqd->cpu_latency);
kfree(kqd);
}
@@ -558,41 +624,44 @@ static void kyber_finish_request(struct request *rq)
rq_clear_domain_token(kqd, rq);
}
static void kyber_completed_request(struct request *rq)
static void add_latency_sample(struct kyber_cpu_latency *cpu_latency,
unsigned int sched_domain, unsigned int type,
u64 target, u64 latency)
{
struct request_queue *q = rq->q;
struct kyber_queue_data *kqd = q->elevator->elevator_data;
unsigned int sched_domain;
u64 now, latency, target;
unsigned int bucket;
u64 divisor;
/*
* Check if this request met our latency goal. If not, quickly gather
* some statistics and start throttling.
*/
sched_domain = kyber_sched_domain(rq->cmd_flags);
switch (sched_domain) {
case KYBER_READ:
target = kqd->read_lat_nsec;
break;
case KYBER_SYNC_WRITE:
target = kqd->write_lat_nsec;
break;
default:
return;
if (latency > 0) {
divisor = max_t(u64, target >> KYBER_LATENCY_SHIFT, 1);
bucket = min_t(unsigned int, div64_u64(latency - 1, divisor),
KYBER_LATENCY_BUCKETS - 1);
} else {
bucket = 0;
}
/* If we are already monitoring latencies, don't check again. */
if (blk_stat_is_active(kqd->cb))
atomic_inc(&cpu_latency->buckets[sched_domain][type][bucket]);
}
static void kyber_completed_request(struct request *rq, u64 now)
{
struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
struct kyber_cpu_latency *cpu_latency;
unsigned int sched_domain;
u64 target;
sched_domain = kyber_sched_domain(rq->cmd_flags);
if (sched_domain == KYBER_OTHER)
return;
now = ktime_get_ns();
if (now < rq->io_start_time_ns)
return;
cpu_latency = get_cpu_ptr(kqd->cpu_latency);
target = kqd->latency_targets[sched_domain];
add_latency_sample(cpu_latency, sched_domain, KYBER_TOTAL_LATENCY,
target, now - rq->start_time_ns);
add_latency_sample(cpu_latency, sched_domain, KYBER_IO_LATENCY, target,
now - rq->io_start_time_ns);
put_cpu_ptr(kqd->cpu_latency);
latency = now - rq->io_start_time_ns;
if (latency > target)
blk_stat_activate_msecs(kqd->cb, 10);
timer_reduce(&kqd->timer, jiffies + HZ / 10);
}
struct flush_kcq_data {
@@ -713,6 +782,9 @@ kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
rq_set_domain_token(rq, nr);
list_del_init(&rq->queuelist);
return rq;
} else {
trace_kyber_throttled(kqd->q,
kyber_domain_names[khd->cur_domain]);
}
} else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) {
nr = kyber_get_domain_token(kqd, khd, hctx);
@@ -723,6 +795,9 @@ kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
rq_set_domain_token(rq, nr);
list_del_init(&rq->queuelist);
return rq;
} else {
trace_kyber_throttled(kqd->q,
kyber_domain_names[khd->cur_domain]);
}
}
@@ -790,17 +865,17 @@ static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
return false;
}
#define KYBER_LAT_SHOW_STORE(op) \
static ssize_t kyber_##op##_lat_show(struct elevator_queue *e, \
char *page) \
#define KYBER_LAT_SHOW_STORE(domain, name) \
static ssize_t kyber_##name##_lat_show(struct elevator_queue *e, \
char *page) \
{ \
struct kyber_queue_data *kqd = e->elevator_data; \
\
return sprintf(page, "%llu\n", kqd->op##_lat_nsec); \
return sprintf(page, "%llu\n", kqd->latency_targets[domain]); \
} \
\
static ssize_t kyber_##op##_lat_store(struct elevator_queue *e, \
const char *page, size_t count) \
static ssize_t kyber_##name##_lat_store(struct elevator_queue *e, \
const char *page, size_t count) \
{ \
struct kyber_queue_data *kqd = e->elevator_data; \
unsigned long long nsec; \
@@ -810,12 +885,12 @@ static ssize_t kyber_##op##_lat_store(struct elevator_queue *e, \
if (ret) \
return ret; \
\
kqd->op##_lat_nsec = nsec; \
kqd->latency_targets[domain] = nsec; \
\
return count; \
}
KYBER_LAT_SHOW_STORE(read);
KYBER_LAT_SHOW_STORE(write);
KYBER_LAT_SHOW_STORE(KYBER_READ, read);
KYBER_LAT_SHOW_STORE(KYBER_WRITE, write);
#undef KYBER_LAT_SHOW_STORE
#define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
@@ -882,7 +957,8 @@ static int kyber_##name##_waiting_show(void *data, struct seq_file *m) \
return 0; \
}
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_SYNC_WRITE, sync_write)
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_WRITE, write)
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_DISCARD, discard)
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other)
#undef KYBER_DEBUGFS_DOMAIN_ATTRS
@@ -900,20 +976,7 @@ static int kyber_cur_domain_show(void *data, struct seq_file *m)
struct blk_mq_hw_ctx *hctx = data;
struct kyber_hctx_data *khd = hctx->sched_data;
switch (khd->cur_domain) {
case KYBER_READ:
seq_puts(m, "READ\n");
break;
case KYBER_SYNC_WRITE:
seq_puts(m, "SYNC_WRITE\n");
break;
case KYBER_OTHER:
seq_puts(m, "OTHER\n");
break;
default:
seq_printf(m, "%u\n", khd->cur_domain);
break;
}
seq_printf(m, "%s\n", kyber_domain_names[khd->cur_domain]);
return 0;
}
@@ -930,7 +993,8 @@ static int kyber_batching_show(void *data, struct seq_file *m)
{#name "_tokens", 0400, kyber_##name##_tokens_show}
static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
KYBER_QUEUE_DOMAIN_ATTRS(read),
KYBER_QUEUE_DOMAIN_ATTRS(sync_write),
KYBER_QUEUE_DOMAIN_ATTRS(write),
KYBER_QUEUE_DOMAIN_ATTRS(discard),
KYBER_QUEUE_DOMAIN_ATTRS(other),
{"async_depth", 0400, kyber_async_depth_show},
{},
@@ -942,7 +1006,8 @@ static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
{#name "_waiting", 0400, kyber_##name##_waiting_show}
static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = {
KYBER_HCTX_DOMAIN_ATTRS(read),
KYBER_HCTX_DOMAIN_ATTRS(sync_write),
KYBER_HCTX_DOMAIN_ATTRS(write),
KYBER_HCTX_DOMAIN_ATTRS(discard),
KYBER_HCTX_DOMAIN_ATTRS(other),
{"cur_domain", 0400, kyber_cur_domain_show},
{"batching", 0400, kyber_batching_show},


@@ -2762,6 +2762,33 @@ static void process_prfcnt_interrupts(struct kbase_device *kbdev, u32 glb_req,
}
}
static void order_job_irq_clear_with_iface_mem_read(void)
{
/* Ensure that the write to the JOB_IRQ_CLEAR register is ordered with respect
 * to the read from interface memory. The ordering is needed because FW & Kbase
 * write to the JOB_IRQ_RAWSTAT and JOB_IRQ_CLEAR registers without any
 * synchronization. Without the barrier there is no guarantee about the
 * ordering: the write to IRQ_CLEAR can take effect after the read from
 * interface memory, which is a problem when FW sends back-to-back
 * notifications for the same CSG for events like SYNC_UPDATE and IDLE, but
 * Kbase gets a single IRQ and observes only the first event. The same can
 * happen with glb events like CFG_ALLOC_EN acknowledgment and GPU idle
 * notification.
*
* MCU CPU
* --------------- ----------------
* Update interface memory Write to IRQ_CLEAR to clear current IRQ
* <barrier> <barrier>
* Write to IRQ_RAWSTAT to raise new IRQ Read interface memory
*/
#if KERNEL_VERSION(5, 10, 0) <= LINUX_VERSION_CODE
__iomb();
#else
/* CPU and GPU would be in the same Outer shareable domain */
dmb(osh);
#endif
}
void kbase_csf_interrupt(struct kbase_device *kbdev, u32 val)
{
unsigned long flags;
@@ -2771,6 +2798,7 @@ void kbase_csf_interrupt(struct kbase_device *kbdev, u32 val)
KBASE_KTRACE_ADD(kbdev, CSF_INTERRUPT, NULL, val);
kbase_reg_write(kbdev, JOB_CONTROL_REG(JOB_IRQ_CLEAR), val);
order_job_irq_clear_with_iface_mem_read();
if (val & JOB_IRQ_GLOBAL_IF) {
const struct kbase_csf_global_iface *const global_iface =


@@ -539,6 +539,8 @@ struct kbase_csf_cpu_queue_context {
/**
* struct kbase_csf_heap_context_allocator - Allocator of heap contexts
*
* @heap_context_size_aligned: Size of a heap context structure, in bytes,
* aligned to GPU cacheline size.
* Heap context structures are allocated by the kernel for use by the firmware.
* The current implementation subdivides a single GPU memory region for use as
* a sparse array.
@@ -560,6 +562,7 @@ struct kbase_csf_heap_context_allocator {
u64 gpu_va;
struct mutex lock;
DECLARE_BITMAP(in_use, MAX_TILER_HEAPS);
u32 heap_context_size_aligned;
};
/**


@@ -23,10 +23,7 @@
#include "mali_kbase_csf_heap_context_alloc.h"
/* Size of one heap context structure, in bytes. */
#define HEAP_CTX_SIZE ((size_t)32)
/* Total size of the GPU memory region allocated for heap contexts, in bytes. */
#define HEAP_CTX_REGION_SIZE (MAX_TILER_HEAPS * HEAP_CTX_SIZE)
#define HEAP_CTX_SIZE ((u32)32)
/**
* sub_alloc - Sub-allocate a heap context from a GPU memory region
@@ -38,8 +35,8 @@
static u64 sub_alloc(struct kbase_csf_heap_context_allocator *const ctx_alloc)
{
struct kbase_context *const kctx = ctx_alloc->kctx;
int heap_nr = 0;
size_t ctx_offset = 0;
unsigned long heap_nr = 0;
u32 ctx_offset = 0;
u64 heap_gpu_va = 0;
struct kbase_vmap_struct mapping;
void *ctx_ptr = NULL;
@@ -55,24 +52,24 @@ static u64 sub_alloc(struct kbase_csf_heap_context_allocator *const ctx_alloc)
return 0;
}
ctx_offset = heap_nr * HEAP_CTX_SIZE;
ctx_offset = heap_nr * ctx_alloc->heap_context_size_aligned;
heap_gpu_va = ctx_alloc->gpu_va + ctx_offset;
ctx_ptr = kbase_vmap_prot(kctx, heap_gpu_va,
HEAP_CTX_SIZE, KBASE_REG_CPU_WR, &mapping);
ctx_alloc->heap_context_size_aligned, KBASE_REG_CPU_WR, &mapping);
if (unlikely(!ctx_ptr)) {
dev_err(kctx->kbdev->dev,
"Failed to map tiler heap context %d (0x%llX)\n",
"Failed to map tiler heap context %lu (0x%llX)\n",
heap_nr, heap_gpu_va);
return 0;
}
memset(ctx_ptr, 0, HEAP_CTX_SIZE);
memset(ctx_ptr, 0, ctx_alloc->heap_context_size_aligned);
kbase_vunmap(ctx_ptr, &mapping);
bitmap_set(ctx_alloc->in_use, heap_nr, 1);
dev_dbg(kctx->kbdev->dev, "Allocated tiler heap context %d (0x%llX)\n",
dev_dbg(kctx->kbdev->dev, "Allocated tiler heap context %lu (0x%llX)\n",
heap_nr, heap_gpu_va);
return heap_gpu_va;
@@ -88,7 +85,7 @@ static void sub_free(struct kbase_csf_heap_context_allocator *const ctx_alloc,
u64 const heap_gpu_va)
{
struct kbase_context *const kctx = ctx_alloc->kctx;
u64 ctx_offset = 0;
u32 ctx_offset = 0;
unsigned int heap_nr = 0;
lockdep_assert_held(&ctx_alloc->lock);
@@ -99,15 +96,15 @@ static void sub_free(struct kbase_csf_heap_context_allocator *const ctx_alloc,
if (WARN_ON(heap_gpu_va < ctx_alloc->gpu_va))
return;
ctx_offset = heap_gpu_va - ctx_alloc->gpu_va;
ctx_offset = (u32)(heap_gpu_va - ctx_alloc->gpu_va);
if (WARN_ON(ctx_offset >= HEAP_CTX_REGION_SIZE) ||
WARN_ON(ctx_offset % HEAP_CTX_SIZE))
if (WARN_ON(ctx_offset >= (ctx_alloc->region->nr_pages << PAGE_SHIFT)) ||
WARN_ON(ctx_offset % ctx_alloc->heap_context_size_aligned))
return;
heap_nr = ctx_offset / HEAP_CTX_SIZE;
heap_nr = ctx_offset / ctx_alloc->heap_context_size_aligned;
dev_dbg(kctx->kbdev->dev,
"Freed tiler heap context %d (0x%llX)\n", heap_nr, heap_gpu_va);
"Freed tiler heap context %lu (0x%llX)\n", heap_nr, heap_gpu_va);
bitmap_clear(ctx_alloc->in_use, heap_nr, 1);
}
@@ -116,12 +113,17 @@ int kbase_csf_heap_context_allocator_init(
struct kbase_csf_heap_context_allocator *const ctx_alloc,
struct kbase_context *const kctx)
{
const u32 gpu_cache_line_size =
(1U << kctx->kbdev->gpu_props.props.l2_props.log2_line_size);
/* We cannot pre-allocate GPU memory here because the
* custom VA zone may not have been created yet.
*/
ctx_alloc->kctx = kctx;
ctx_alloc->region = NULL;
ctx_alloc->gpu_va = 0;
ctx_alloc->heap_context_size_aligned =
(HEAP_CTX_SIZE + gpu_cache_line_size - 1) & ~(gpu_cache_line_size - 1);
mutex_init(&ctx_alloc->lock);
bitmap_zero(ctx_alloc->in_use, MAX_TILER_HEAPS);
@@ -156,7 +158,7 @@ u64 kbase_csf_heap_context_allocator_alloc(
struct kbase_context *const kctx = ctx_alloc->kctx;
u64 flags = BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR |
BASE_MEM_PROT_CPU_WR | BASEP_MEM_NO_USER_FREE;
u64 nr_pages = PFN_UP(HEAP_CTX_REGION_SIZE);
u64 nr_pages = PFN_UP(MAX_TILER_HEAPS * ctx_alloc->heap_context_size_aligned);
u64 heap_gpu_va = 0;
#ifdef CONFIG_MALI_VECTOR_DUMP


@@ -1488,8 +1488,8 @@ static int kbasep_cs_tiler_heap_init(struct kbase_context *kctx,
{
if (heap_init->in.group_id >= MEMORY_GROUP_MANAGER_NR_GROUPS)
return -EINVAL;
kctx->jit_group_id = heap_init->in.group_id;
else
kctx->jit_group_id = heap_init->in.group_id;
return kbase_csf_tiler_heap_init(kctx, heap_init->in.chunk_size,
heap_init->in.initial_chunks, heap_init->in.max_chunks,


@@ -725,8 +725,7 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path)
dev->zone_nr_blocks = dmz_sect2blk(dev->zone_nr_sectors);
dev->zone_nr_blocks_shift = ilog2(dev->zone_nr_blocks);
dev->nr_zones = (dev->capacity + dev->zone_nr_sectors - 1)
>> dev->zone_nr_sectors_shift;
dev->nr_zones = blkdev_nr_zones(dev->bdev);
dmz->dev = dev;

File diff suppressed because it is too large


@@ -1,93 +1,42 @@
#ifndef _AW8622_HAPTIC_H_
#define _AW8622_HAPTIC_H_
struct aw8622_effect_state {
int effect_idx;
int duration;
int secs;
unsigned long nsces;
bool is_shock_stop;
};
#include <linux/hrtimer.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>
#include <linux/pinctrl/consumer.h>
struct waveform_data_info {
bool is_loaded;
const char *waveform_name;
unsigned int waveform_period; /* duration of the whole waveform unit, in ms */
unsigned int sample_freq;
unsigned int sample_nums;
unsigned int us_time_len; /* length in us */
unsigned int max_sample_val;
unsigned int len;
unsigned char *data;
};
#define AW_GPIO_MODE_LED_DEFAULT (0)
#define HAPTIC_GPIO_AW8622_DEFAULT (0)
#define HAPTIC_GPIO_AW8622_SET (1)
#define HAPTIC_PWM_MEMORY_MODE_CLOCK (26000000)
#define HAPTIC_PWM_OLD_MODE_CLOCK (26000000)
#define DEFAULT_FREQUENCY (208)
#define MIN_FREQUENCY (203)
#define MAX_FREQUENCY (212)
struct aw8622_haptic {
/* Hardware info */
unsigned int pwm_ch;
struct device *dev;
int hwen_gpio;
struct pinctrl *ppinctrl_pwm;
unsigned int default_pwm_freq;
unsigned int h_l_period;
/* Vibration waveform data field */
struct delayed_work load_waveform_work;
struct delayed_work hw_off_work;
unsigned int wave_sample_period; //wave sample period is ns
struct waveform_data_info *p_waveform_data;
int waveform_data_nums;
unsigned int wave_max_len;
bool is_malloc_wavedata_info;
int cur_load_idx;
unsigned int load_idx_offset;
bool is_malloc_dma_memory;
dma_addr_t wave_phy;
void *wave_vir;
unsigned dma_len;
spinlock_t spin_lock;
/* Vibration control field */
bool is_actived;
bool is_real_play;
bool is_power_on;
bool is_wavefrom_ready;
bool is_hwen;
int effect_idx;
unsigned int duration;
unsigned int interval;
unsigned int center_freq;
struct workqueue_struct *aw8622_wq;
struct hrtimer timer;
struct work_struct play_work;
struct work_struct stop_play_work;
struct work_struct test_work;
unsigned int test_cnt;
struct delayed_work hw_off_work;
struct workqueue_struct *aw8622_wq;
struct mutex mutex_lock;
struct hrtimer timer;
struct aw8622_effect_state effect_state;
struct pinctrl *ppinctrl_pwm;
int hwen_gpio;
unsigned int pwm_ch;
unsigned int duration;
unsigned int frequency;
unsigned int center_freq;
unsigned int default_pwm_freq;
unsigned int wave_sample_period;
bool is_power_on;
bool is_actived;
bool is_hwen;
};
#define LONG_SHOCK_BIT_NUMS_PER_SAMPLED_VALE (80)
#define WAVEFORM_DATA_OFFSET (12)
#define BIT_NUMS_PER_SAMPLED_VALE (250)
#define BIT_NUMS_PER_BYTE (8)
#define WAVEFORM_MAX_SAMPLE_VAL (127)
#define WAVEFORM_MIN_SAMPLE_VAL (-127)
#define MAX_NUMS_NONNEGATIVE_SIGNEC_8BIT (128) /* number of non-negative values representable by signed 8-bit data */
#define MAX_NUMS_POSITIVE_SIGNEC_8BIT (128)
#define MAX_COUNT_SIGNEC_8BIT (255)
#endif
#endif /* _AW8622_HAPTIC_H_ */


@@ -7552,7 +7552,7 @@ void wlanInitFeatureOption(IN struct ADAPTER *prAdapter)
prWifiVar->ucApBandwidth = (uint8_t) wlanCfgGetUint32(
prAdapter, "ApBw", MAX_BW_160MHZ);
prWifiVar->ucAp2gBandwidth = (uint8_t) wlanCfgGetUint32(
prAdapter, "Ap2gBw", MAX_BW_20MHZ);
prAdapter, "Ap2gBw", MAX_BW_40MHZ);
prWifiVar->ucAp5gBandwidth = (uint8_t) wlanCfgGetUint32(
prAdapter, "Ap5gBw", MAX_BW_80MHZ);
prWifiVar->ucAp6gBandwidth = (uint8_t) wlanCfgGetUint32(


@@ -1663,7 +1663,7 @@
/* 1(default): Run on big core when tput over threshold
* 0: Disable (Let system scheduler decide)
*/
#define CFG_SUPPORT_TPUT_ON_BIG_CORE 1
#define CFG_SUPPORT_TPUT_ON_BIG_CORE 0
#define CFG_SUPPORT_LITTLE_CPU_BOOST 0


@@ -1392,7 +1392,7 @@ struct CMD_ACCESS_RX_STAT {
struct EVENT_ACCESS_RX_STAT {
uint32_t u4SeqNum;
uint32_t u4TotalNum;
uint32_t au4Buffer[1];
uint32_t au4Buffer[];
};
#if CFG_SUPPORT_TX_BF


@@ -323,7 +323,11 @@ void statsParseARPInfo(struct sk_buff *skb,
IPV4TOSTR(&pucEthBody[ARP_SENDER_IP_OFFSET]));
break;
case EVENT_TX:
#if BUILD_QA_DBG
DBGLOG(TX, INFO,
#else
DBGLOG_LIMITED(TX, INFO,
#endif
"ARP %s SRC MAC/IP["
MACSTR "]/[" IPV4STR "], TAR MAC/IP["
MACSTR "]/[" IPV4STR "], SeqNo: %d\n",


@@ -1443,6 +1443,10 @@ int mtk_cfg80211_connect(struct wiphy *wiphy,
prWpaInfo->u4CipherPairwise =
IW_AUTH_CIPHER_CCMP;
break;
#if CFG_SUPPORT_WAPI
case WLAN_CIPHER_SUITE_SMS4:
break;
#endif
case WLAN_CIPHER_SUITE_BIP_GMAC_256:
prWpaInfo->u4CipherPairwise =
IW_AUTH_CIPHER_GCMP256;
@@ -1501,6 +1505,10 @@ int mtk_cfg80211_connect(struct wiphy *wiphy,
prWpaInfo->u4CipherGroup =
IW_AUTH_CIPHER_GCMP128;
break;
#if CFG_SUPPORT_WAPI
case WLAN_CIPHER_SUITE_SMS4:
break;
#endif
case WLAN_CIPHER_SUITE_NO_GROUP_ADDR:
break;
default:


@@ -655,6 +655,9 @@ const uint32_t mtk_cipher_suites[] = {
WLAN_CIPHER_SUITE_WEP104,
WLAN_CIPHER_SUITE_TKIP,
WLAN_CIPHER_SUITE_CCMP,
#if CFG_SUPPORT_WAPI
WLAN_CIPHER_SUITE_SMS4,
#endif
/* keep last -- depends on hw flags! */
WLAN_CIPHER_SUITE_AES_CMAC,
@@ -1479,6 +1482,8 @@ static const struct wiphy_wowlan_support mtk_wlan_wowlan_support = {
*******************************************************************************
*/
static void wlanRemove(void);
/*******************************************************************************
* F U N C T I O N S
*******************************************************************************
@@ -5667,6 +5672,8 @@ static int32_t wlanOnAtReset(void)
* If WMT being removed in the future, you should invoke
* wlanRemove directly from here
*/
kalSendAeeWarning("WFSYS", "wlanOnAtReset fail\n");
wlanRemove();
#if 0
switch (eFailReason) {
case ADAPTER_START_FAIL:
@@ -6057,7 +6064,7 @@ wlanOffNotifyCfg80211Disconnect(IN struct GLUE_INFO *prGlueInfo)
* \return (none)
*/
/*----------------------------------------------------------------------------*/
static void wlanRemove(void)
void wlanRemove(void)
{
struct net_device *prDev = NULL;
struct WLANDEV_INFO *prWlandevInfo = NULL;


@@ -321,7 +321,7 @@ struct iw_p2p_version {
extern struct ieee80211_supported_band mtk_band_2ghz;
extern struct ieee80211_supported_band mtk_band_5ghz;
extern const uint32_t mtk_cipher_suites[9];
extern const uint32_t mtk_cipher_suites[];
/******************************************************************************


@@ -5,6 +5,8 @@
#define pr_fmt(fmt) "pob_qos: " fmt
#include <linux/notifier.h>
#include <mt-plat/mtk_perfobserver.h>
#include <linux/pm.h>
#include <linux/suspend.h>
#include <linux/ktime.h>
#include <linux/hrtimer.h>
@@ -27,7 +29,7 @@
#include "obpfm_qos_bound.h"
#define MS_TO_NS 1000000
#define ADJUST_INTERVAL_MS 32
#define ADJUST_INTERVAL_MS 64
enum POBQOS_NTF_PUSH_TYPE {
POBQOS_NTF_TIMER = 0x00,
@@ -41,6 +43,24 @@ struct POBQOS_NTF_PUSH_TAG {
struct work_struct sWork;
};
static DEFINE_MUTEX(pob_timer_lock);
static bool pob_timer_active;
#ifdef CONFIG_PM_SLEEP
static bool pob_timer_needs_restart;
static int pob_pm_notifier_cb(struct notifier_block *nb,
unsigned long action, void *data);
static struct notifier_block pob_pm_nb = {
.notifier_call = pob_pm_notifier_cb,
};
#endif
static inline ktime_t pob_timer_interval(void)
{
return ms_to_ktime(ADJUST_INTERVAL_MS);
}
static void pob_enable_timer(void);
static void pob_disable_timer(void);
@@ -87,17 +107,63 @@ static struct hrtimer _pobqos_hrt;
static void pob_enable_timer(void)
{
ktime_t ktime;
ktime_t interval = pob_timer_interval();
ktime = ktime_set(0, ADJUST_INTERVAL_MS * MS_TO_NS);
hrtimer_start(&_pobqos_hrt, ktime, HRTIMER_MODE_REL);
mutex_lock(&pob_timer_lock);
if (pob_timer_active) {
mutex_unlock(&pob_timer_lock);
return;
}
pob_timer_active = true;
mutex_unlock(&pob_timer_lock);
hrtimer_start(&_pobqos_hrt, interval, HRTIMER_MODE_REL);
}
static void pob_disable_timer(void)
{
hrtimer_cancel(&_pobqos_hrt);
bool should_cancel = false;
mutex_lock(&pob_timer_lock);
if (pob_timer_active) {
pob_timer_active = false;
should_cancel = true;
}
mutex_unlock(&pob_timer_lock);
if (should_cancel)
hrtimer_cancel(&_pobqos_hrt);
}
#ifdef CONFIG_PM_SLEEP
static int pob_pm_notifier_cb(struct notifier_block *nb,
unsigned long action, void *data)
{
switch (action) {
case PM_SUSPEND_PREPARE:
case PM_HIBERNATION_PREPARE:
mutex_lock(&pob_timer_lock);
pob_timer_needs_restart = pob_timer_active;
mutex_unlock(&pob_timer_lock);
if (pob_timer_needs_restart)
pob_disable_timer();
break;
case PM_POST_SUSPEND:
case PM_POST_HIBERNATION:
if (pob_timer_needs_restart) {
pob_enable_timer();
pob_timer_needs_restart = false;
}
break;
default:
break;
}
return NOTIFY_OK;
}
#endif
static void pobqos_hrt_wq_cb(struct work_struct *psWork)
{
struct POBQOS_NTF_PUSH_TAG *vpPush =
@@ -189,11 +255,13 @@ final:
static enum hrtimer_restart pobqos_hrt_cb(struct hrtimer *timer)
{
struct POBQOS_NTF_PUSH_TAG *vpPush = NULL;
ktime_t interval;
ktime_t ktime;
if (!READ_ONCE(pob_timer_active))
return HRTIMER_NORESTART;
ktime = ktime_set(0, ADJUST_INTERVAL_MS * MS_TO_NS);
hrtimer_add_expires(timer, ktime);
interval = pob_timer_interval();
hrtimer_forward_now(timer, interval);
if (_gpPOBQoSNtfWQ)
vpPush =
@@ -280,12 +348,19 @@ int __init pob_qos_pfm_init(void)
void __exit pob_qos_pfm_exit(void)
{
unregister_qos_notifier(&pob_pfm_qos_notifier);
#ifdef CONFIG_PM_SLEEP
unregister_pm_notifier(&pob_pm_nb);
#endif
}
int pob_qos_pfm_enable(void)
{
pob_qos_set_last_time_ms(0);
#ifdef CONFIG_PM_SLEEP
register_pm_notifier(&pob_pm_nb);
#endif
pob_enable_timer();
return 0;
}
@@ -293,6 +368,9 @@ int pob_qos_pfm_enable(void)
int pob_qos_pfm_disable(void)
{
pob_disable_timer();
#ifdef CONFIG_PM_SLEEP
pob_timer_needs_restart = false;
#endif
pob_qos_set_last_time_ms(1);
return 0;


@@ -2121,9 +2121,9 @@ static int mt6360_set_bist_carrier_mode(struct tcpc_device *tcpc, u8 pattern)
return 0;
}
/* message header (2byte) + data object (7*4) */
/* transmit count (1byte) + message header (2byte) + data object (7*4) */
#define MT6360_TRANSMIT_MAX_SIZE \
(sizeof(u16) + sizeof(u32) * 7)
(1 + sizeof(u16) + sizeof(u32) * 7)
#ifdef CONFIG_USB_PD_RETRY_CRC_DISCARD
static int mt6360_retransmit(struct tcpc_device *tcpc)

View File

@@ -1576,8 +1576,8 @@ static int mt6362_get_message(struct tcpc_device *tcpc, u32 *payload,
return ret;
}
/* message header (2byte) + data object (7*4) */
#define MT6362_TRANSMIT_MAX_SIZE (sizeof(u16) + sizeof(u32) * 7)
/* transmit count (1byte) + message header (2byte) + data object (7*4) */
#define MT6362_TRANSMIT_MAX_SIZE (1 + sizeof(u16) + sizeof(u32) * 7)
static int mt6362_transmit(struct tcpc_device *tcpc,
enum tcpm_transmit_type type, u16 header,

View File

@@ -26,6 +26,7 @@
* datablocks and metadata blocks.
*/
#include <linux/blkdev.h>
#include <linux/fs.h>
#include <linux/vfs.h>
#include <linux/slab.h>
@@ -40,44 +41,103 @@
#include "page_actor.h"
/*
* Read the metadata block length, this is stored in the first two
* bytes of the metadata block.
* Returns the amount of bytes copied to the page actor.
*/
static struct buffer_head *get_block_length(struct super_block *sb,
u64 *cur_index, int *offset, int *length)
static int copy_bio_to_actor(struct bio *bio,
struct squashfs_page_actor *actor,
int offset, int req_length)
{
struct squashfs_sb_info *msblk = sb->s_fs_info;
struct buffer_head *bh;
void *actor_addr = squashfs_first_page(actor);
struct bvec_iter_all iter_all = {};
struct bio_vec *bvec = bvec_init_iter_all(&iter_all);
int copied_bytes = 0;
int actor_offset = 0;
bh = sb_bread(sb, *cur_index);
if (bh == NULL)
return NULL;
if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all)))
return 0;
if (msblk->devblksize - *offset == 1) {
*length = (unsigned char) bh->b_data[*offset];
put_bh(bh);
bh = sb_bread(sb, ++(*cur_index));
if (bh == NULL)
return NULL;
*length |= (unsigned char) bh->b_data[0] << 8;
*offset = 1;
} else {
*length = (unsigned char) bh->b_data[*offset] |
(unsigned char) bh->b_data[*offset + 1] << 8;
*offset += 2;
while (copied_bytes < req_length) {
int bytes_to_copy = min_t(int, bvec->bv_len - offset,
PAGE_SIZE - actor_offset);
if (*offset == msblk->devblksize) {
put_bh(bh);
bh = sb_bread(sb, ++(*cur_index));
if (bh == NULL)
return NULL;
*offset = 0;
bytes_to_copy = min_t(int, bytes_to_copy,
req_length - copied_bytes);
memcpy(actor_addr + actor_offset,
page_address(bvec->bv_page) + bvec->bv_offset + offset,
bytes_to_copy);
actor_offset += bytes_to_copy;
copied_bytes += bytes_to_copy;
offset += bytes_to_copy;
if (actor_offset >= PAGE_SIZE) {
actor_addr = squashfs_next_page(actor);
if (!actor_addr)
break;
actor_offset = 0;
}
if (offset >= bvec->bv_len) {
if (!bio_next_segment(bio, &iter_all))
break;
offset = 0;
}
}
return bh;
squashfs_finish_page(actor);
return copied_bytes;
}
static int squashfs_bio_read(struct super_block *sb, u64 index, int length,
struct bio **biop, int *block_offset)
{
struct squashfs_sb_info *msblk = sb->s_fs_info;
const u64 read_start = round_down(index, msblk->devblksize);
const sector_t block = read_start >> msblk->devblksize_log2;
const u64 read_end = round_up(index + length, msblk->devblksize);
const sector_t block_end = read_end >> msblk->devblksize_log2;
int offset = read_start - round_down(index, PAGE_SIZE);
int total_len = (block_end - block) << msblk->devblksize_log2;
const int page_count = DIV_ROUND_UP(total_len + offset, PAGE_SIZE);
int error, i;
struct bio *bio;
bio = bio_alloc(GFP_NOIO, page_count);
if (!bio)
return -ENOMEM;
bio_set_dev(bio, sb->s_bdev);
bio->bi_opf = READ;
bio->bi_iter.bi_sector = block * (msblk->devblksize >> SECTOR_SHIFT);
for (i = 0; i < page_count; ++i) {
unsigned int len =
min_t(unsigned int, PAGE_SIZE - offset, total_len);
struct page *page = alloc_page(GFP_NOIO);
if (!page) {
error = -ENOMEM;
goto out_free_bio;
}
if (!bio_add_page(bio, page, len, offset)) {
error = -EIO;
goto out_free_bio;
}
offset = 0;
total_len -= len;
}
error = submit_bio_wait(bio);
if (error)
goto out_free_bio;
*biop = bio;
*block_offset = index & ((1 << msblk->devblksize_log2) - 1);
return 0;
out_free_bio:
bio_free_pages(bio);
bio_put(bio);
return error;
}
/*
* Read and decompress a metadata block or datablock. Length is non-zero
@@ -89,129 +149,88 @@ static struct buffer_head *get_block_length(struct super_block *sb,
* algorithms).
*/
int squashfs_read_data(struct super_block *sb, u64 index, int length,
u64 *next_index, struct squashfs_page_actor *output)
u64 *next_index, struct squashfs_page_actor *output)
{
struct squashfs_sb_info *msblk = sb->s_fs_info;
struct buffer_head **bh;
int offset = index & ((1 << msblk->devblksize_log2) - 1);
u64 cur_index = index >> msblk->devblksize_log2;
int bytes, compressed, b = 0, k = 0, avail, i;
bh = kcalloc(((output->length + msblk->devblksize - 1)
>> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL);
if (bh == NULL)
return -ENOMEM;
struct bio *bio = NULL;
int compressed;
int res;
int offset;
if (length) {
/*
* Datablock.
*/
bytes = -offset;
compressed = SQUASHFS_COMPRESSED_BLOCK(length);
length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length);
if (next_index)
*next_index = index + length;
TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n",
index, compressed ? "" : "un", length, output->length);
if (length < 0 || length > output->length ||
(index + length) > msblk->bytes_used)
goto read_failure;
for (b = 0; bytes < length; b++, cur_index++) {
bh[b] = sb_getblk(sb, cur_index);
if (bh[b] == NULL)
goto block_release;
bytes += msblk->devblksize;
}
ll_rw_block(REQ_OP_READ, 0, b, bh);
} else {
/*
* Metadata block.
*/
if ((index + 2) > msblk->bytes_used)
goto read_failure;
const u8 *data;
struct bvec_iter_all iter_all = {};
struct bio_vec *bvec = bvec_init_iter_all(&iter_all);
bh[0] = get_block_length(sb, &cur_index, &offset, &length);
if (bh[0] == NULL)
goto read_failure;
b = 1;
if (index + 2 > msblk->bytes_used) {
res = -EIO;
goto out;
}
res = squashfs_bio_read(sb, index, 2, &bio, &offset);
if (res)
goto out;
if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all))) {
res = -EIO;
goto out_free_bio;
}
/* Extract the length of the metadata block */
data = page_address(bvec->bv_page) + bvec->bv_offset;
length = data[offset];
if (offset < bvec->bv_len - 1) {
length |= data[offset + 1] << 8;
} else {
if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all))) {
res = -EIO;
goto out_free_bio;
}
data = page_address(bvec->bv_page) + bvec->bv_offset;
length |= data[0] << 8;
}
bio_free_pages(bio);
bio_put(bio);
bytes = msblk->devblksize - offset;
compressed = SQUASHFS_COMPRESSED(length);
length = SQUASHFS_COMPRESSED_SIZE(length);
if (next_index)
*next_index = index + length + 2;
index += 2;
TRACE("Block @ 0x%llx, %scompressed size %d\n", index,
compressed ? "" : "un", length);
if (length < 0 || length > output->length ||
(index + length) > msblk->bytes_used)
goto block_release;
for (; bytes < length; b++) {
bh[b] = sb_getblk(sb, ++cur_index);
if (bh[b] == NULL)
goto block_release;
bytes += msblk->devblksize;
}
ll_rw_block(REQ_OP_READ, 0, b - 1, bh + 1);
compressed ? "" : "un", length);
}
if (next_index)
*next_index = index + length;
for (i = 0; i < b; i++) {
wait_on_buffer(bh[i]);
if (!buffer_uptodate(bh[i]))
goto block_release;
}
res = squashfs_bio_read(sb, index, length, &bio, &offset);
if (res)
goto out;
if (compressed) {
if (!msblk->stream)
goto read_failure;
length = squashfs_decompress(msblk, bh, b, offset, length,
output);
if (length < 0)
goto read_failure;
} else {
/*
* Block is uncompressed.
*/
int in, pg_offset = 0;
void *data = squashfs_first_page(output);
for (bytes = length; k < b; k++) {
in = min(bytes, msblk->devblksize - offset);
bytes -= in;
while (in) {
if (pg_offset == PAGE_SIZE) {
data = squashfs_next_page(output);
pg_offset = 0;
}
avail = min_t(int, in, PAGE_SIZE -
pg_offset);
memcpy(data + pg_offset, bh[k]->b_data + offset,
avail);
in -= avail;
pg_offset += avail;
offset += avail;
}
offset = 0;
put_bh(bh[k]);
if (!msblk->stream) {
res = -EIO;
goto out_free_bio;
}
squashfs_finish_page(output);
res = squashfs_decompress(msblk, bio, offset, length, output);
} else {
res = copy_bio_to_actor(bio, output, offset, length);
}
kfree(bh);
return length;
out_free_bio:
bio_free_pages(bio);
bio_put(bio);
out:
if (res < 0)
ERROR("Failed to read block 0x%llx: %d\n", index, res);
block_release:
for (; k < b; k++)
put_bh(bh[k]);
read_failure:
ERROR("squashfs_read_data failed to read block 0x%llx\n",
(unsigned long long) index);
kfree(bh);
return -EIO;
return res;
}

View File

@@ -23,13 +23,14 @@
* decompressor.h
*/
#include <linux/bio.h>
struct squashfs_decompressor {
void *(*init)(struct squashfs_sb_info *, void *);
void *(*comp_opts)(struct squashfs_sb_info *, void *, int);
void (*free)(void *);
int (*decompress)(struct squashfs_sb_info *, void *,
struct buffer_head **, int, int, int,
struct squashfs_page_actor *);
struct bio *, int, int, struct squashfs_page_actor *);
int id;
char *name;
int supported;

View File

@@ -8,7 +8,7 @@
#include <linux/types.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/bio.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/cpumask.h>
@@ -182,14 +182,15 @@ wait:
}
int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh,
int b, int offset, int length, struct squashfs_page_actor *output)
int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio,
int offset, int length,
struct squashfs_page_actor *output)
{
int res;
struct squashfs_stream *stream = msblk->stream;
struct decomp_stream *decomp_stream = get_decomp_stream(msblk, stream);
res = msblk->decompressor->decompress(msblk, decomp_stream->stream,
bh, b, offset, length, output);
bio, offset, length, output);
put_decomp_stream(decomp_stream, stream);
if (res < 0)
ERROR("%s decompression failed, data probably corrupt\n",

View File

@@ -74,14 +74,17 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
}
}
int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh,
int b, int offset, int length, struct squashfs_page_actor *output)
int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio,
int offset, int length, struct squashfs_page_actor *output)
{
struct squashfs_stream __percpu *percpu =
(struct squashfs_stream __percpu *) msblk->stream;
struct squashfs_stream *stream = get_cpu_ptr(percpu);
int res = msblk->decompressor->decompress(msblk, stream->stream, bh, b,
offset, length, output);
struct squashfs_stream __percpu *percpu;
struct squashfs_stream *stream;
int res;
percpu = (struct squashfs_stream __percpu *)msblk->stream;
stream = get_cpu_ptr(percpu);
res = msblk->decompressor->decompress(msblk, stream->stream, bio,
offset, length, output);
put_cpu_ptr(stream);
if (res < 0)

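The `decompressor_multi_percpu.c` variant above gives every CPU its own decompression stream, so `get_cpu_ptr()`/`put_cpu_ptr()` need only pin the task to the CPU rather than take a lock. A toy sketch of the idea, with plain array indexing standing in for per-CPU data (the `stream` struct and ids are illustrative, not the kernel types):

```c
#include <assert.h>

#define NCPUS 4

struct stream { int in_use; int uses; };

static struct stream streams[NCPUS];

/* Each "CPU" owns a private stream, so no lock is needed while it runs. */
static struct stream *get_stream(int cpu)
{
	struct stream *s = &streams[cpu];

	s->in_use = 1;	/* the real code disables preemption instead */
	return s;
}

static void put_stream(struct stream *s)
{
	s->uses++;
	s->in_use = 0;
}
```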
View File

@@ -9,7 +9,7 @@
#include <linux/types.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/bio.h>
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
@@ -61,14 +61,15 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
}
}
int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh,
int b, int offset, int length, struct squashfs_page_actor *output)
int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio,
int offset, int length,
struct squashfs_page_actor *output)
{
int res;
struct squashfs_stream *stream = msblk->stream;
mutex_lock(&stream->mutex);
res = msblk->decompressor->decompress(msblk, stream->stream, bh, b,
res = msblk->decompressor->decompress(msblk, stream->stream, bio,
offset, length, output);
mutex_unlock(&stream->mutex);

View File

@@ -6,7 +6,7 @@
* the COPYING file in the top-level directory.
*/
#include <linux/buffer_head.h>
#include <linux/bio.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
@@ -91,20 +91,23 @@ static void lz4_free(void *strm)
static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm,
struct buffer_head **bh, int b, int offset, int length,
struct bio *bio, int offset, int length,
struct squashfs_page_actor *output)
{
struct bvec_iter_all iter_all = {};
struct bio_vec *bvec = bvec_init_iter_all(&iter_all);
struct squashfs_lz4 *stream = strm;
void *buff = stream->input, *data;
int avail, i, bytes = length, res;
int bytes = length, res;
for (i = 0; i < b; i++) {
avail = min(bytes, msblk->devblksize - offset);
memcpy(buff, bh[i]->b_data + offset, avail);
while (bio_next_segment(bio, &iter_all)) {
int avail = min(bytes, ((int)bvec->bv_len) - offset);
data = page_address(bvec->bv_page) + bvec->bv_offset;
memcpy(buff, data + offset, avail);
buff += avail;
bytes -= avail;
offset = 0;
put_bh(bh[i]);
}
res = LZ4_decompress_safe(stream->input, stream->output,

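LZ4 and LZO are one-shot decompressors, so their conversion above only changes how the linear input buffer is filled: instead of walking `bh[i]->b_data`, the loop walks bio segments with `bio_next_segment()` and honours the byte offset into the first one. A userspace sketch of that gather step (segment layout is illustrative, not the kernel bvec API):

```c
#include <assert.h>
#include <string.h>

struct seg { const char *data; int len; };

/*
 * Concatenate every segment into one contiguous buffer that a one-shot
 * decompressor such as LZ4_decompress_safe() could then consume.
 */
static int gather(const struct seg *segs, int nsegs, int offset,
		  int length, char *out)
{
	char *buff = out;
	int bytes = length;

	for (int i = 0; i < nsegs && bytes > 0; i++) {
		int avail = bytes < segs[i].len - offset
				? bytes : segs[i].len - offset;

		memcpy(buff, segs[i].data + offset, avail);
		buff += avail;
		bytes -= avail;
		offset = 0;	/* only the first segment starts mid-way */
	}
	return (int)(buff - out);	/* bytes actually gathered */
}
```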
View File

@@ -22,7 +22,7 @@
*/
#include <linux/mutex.h>
#include <linux/buffer_head.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/lzo.h>
@@ -76,21 +76,24 @@ static void lzo_free(void *strm)
static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm,
struct buffer_head **bh, int b, int offset, int length,
struct bio *bio, int offset, int length,
struct squashfs_page_actor *output)
{
struct bvec_iter_all iter_all = {};
struct bio_vec *bvec = bvec_init_iter_all(&iter_all);
struct squashfs_lzo *stream = strm;
void *buff = stream->input, *data;
int avail, i, bytes = length, res;
int bytes = length, res;
size_t out_len = output->length;
for (i = 0; i < b; i++) {
avail = min(bytes, msblk->devblksize - offset);
memcpy(buff, bh[i]->b_data + offset, avail);
while (bio_next_segment(bio, &iter_all)) {
int avail = min(bytes, ((int)bvec->bv_len) - offset);
data = page_address(bvec->bv_page) + bvec->bv_offset;
memcpy(buff, data + offset, avail);
buff += avail;
bytes -= avail;
offset = 0;
put_bh(bh[i]);
}
res = lzo1x_decompress_safe(stream->input, (size_t)length,

View File

@@ -53,8 +53,8 @@ extern void *squashfs_decompressor_setup(struct super_block *, unsigned short);
/* decompressor_xxx.c */
extern void *squashfs_decompressor_create(struct squashfs_sb_info *, void *);
extern void squashfs_decompressor_destroy(struct squashfs_sb_info *);
extern int squashfs_decompress(struct squashfs_sb_info *, struct buffer_head **,
int, int, int, struct squashfs_page_actor *);
extern int squashfs_decompress(struct squashfs_sb_info *, struct bio *,
int, int, struct squashfs_page_actor *);
extern int squashfs_max_decompressors(void);
/* export.c */

View File

@@ -23,7 +23,7 @@
#include <linux/mutex.h>
#include <linux/buffer_head.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/xz.h>
#include <linux/bitops.h>
@@ -130,11 +130,12 @@ static void squashfs_xz_free(void *strm)
static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
struct buffer_head **bh, int b, int offset, int length,
struct bio *bio, int offset, int length,
struct squashfs_page_actor *output)
{
enum xz_ret xz_err;
int avail, total = 0, k = 0;
struct bvec_iter_all iter_all = {};
struct bio_vec *bvec = bvec_init_iter_all(&iter_all);
int total = 0, error = 0;
struct squashfs_xz *stream = strm;
xz_dec_reset(stream->state);
@@ -144,11 +145,23 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
stream->buf.out_size = PAGE_SIZE;
stream->buf.out = squashfs_first_page(output);
do {
if (stream->buf.in_pos == stream->buf.in_size && k < b) {
avail = min(length, msblk->devblksize - offset);
for (;;) {
enum xz_ret xz_err;
if (stream->buf.in_pos == stream->buf.in_size) {
const void *data;
int avail;
if (!bio_next_segment(bio, &iter_all)) {
/* XZ_STREAM_END must be reached. */
error = -EIO;
break;
}
avail = min(length, ((int)bvec->bv_len) - offset);
data = page_address(bvec->bv_page) + bvec->bv_offset;
length -= avail;
stream->buf.in = bh[k]->b_data + offset;
stream->buf.in = data + offset;
stream->buf.in_size = avail;
stream->buf.in_pos = 0;
offset = 0;
@@ -163,23 +176,17 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
}
xz_err = xz_dec_run(stream->state, &stream->buf);
if (stream->buf.in_pos == stream->buf.in_size && k < b)
put_bh(bh[k++]);
} while (xz_err == XZ_OK);
if (xz_err == XZ_STREAM_END)
break;
if (xz_err != XZ_OK) {
error = -EIO;
break;
}
}
squashfs_finish_page(output);
if (xz_err != XZ_STREAM_END || k < b)
goto out;
return total + stream->buf.out_pos;
out:
for (; k < b; k++)
put_bh(bh[k]);
return -EIO;
return error ? error : total + stream->buf.out_pos;
}
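The xz, zlib, and zstd conversions all adopt the same refill shape: an unbounded loop that pulls the next bio segment whenever the decoder's input window runs dry, and fails with `-EIO` if the segments run out before the decoder signals end-of-stream. A runnable sketch of that control flow, where a trivial "decoder" scanning for a 0 byte stands in for `xz_dec_run()`/`zlib_inflate()` (all names here are illustrative):

```c
#include <assert.h>
#include <stddef.h>

struct inseg { const unsigned char *data; int len; };

#define SKETCH_EIO (-5)

static int drain(const struct inseg *segs, int nsegs)
{
	int total = 0, k = 0;
	const unsigned char *in = NULL;
	int in_pos = 0, in_size = 0;

	for (;;) {
		if (in_pos == in_size) {	/* input exhausted: refill */
			if (k == nsegs)
				return SKETCH_EIO; /* end marker never seen */
			in = segs[k].data;
			in_size = segs[k].len;
			in_pos = 0;
			k++;
		}
		if (in[in_pos] == 0)	/* stand-in for XZ_STREAM_END */
			return total;
		in_pos++;
		total++;		/* one "decoded" byte */
	}
}
```

Running out of segments before the end marker is exactly the case the old code caught with the `k < b` check after the loop; the new shape reports it from inside the loop instead.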
const struct squashfs_decompressor squashfs_xz_comp_ops = {

View File

@@ -23,7 +23,7 @@
#include <linux/mutex.h>
#include <linux/buffer_head.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/zlib.h>
#include <linux/vmalloc.h>
@@ -63,21 +63,35 @@ static void zlib_free(void *strm)
static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
struct buffer_head **bh, int b, int offset, int length,
struct bio *bio, int offset, int length,
struct squashfs_page_actor *output)
{
int zlib_err, zlib_init = 0, k = 0;
struct bvec_iter_all iter_all = {};
struct bio_vec *bvec = bvec_init_iter_all(&iter_all);
int zlib_init = 0, error = 0;
z_stream *stream = strm;
stream->avail_out = PAGE_SIZE;
stream->next_out = squashfs_first_page(output);
stream->avail_in = 0;
do {
if (stream->avail_in == 0 && k < b) {
int avail = min(length, msblk->devblksize - offset);
for (;;) {
int zlib_err;
if (stream->avail_in == 0) {
const void *data;
int avail;
if (!bio_next_segment(bio, &iter_all)) {
/* Z_STREAM_END must be reached. */
error = -EIO;
break;
}
avail = min(length, ((int)bvec->bv_len) - offset);
data = page_address(bvec->bv_page) + bvec->bv_offset;
length -= avail;
stream->next_in = bh[k]->b_data + offset;
stream->next_in = data + offset;
stream->avail_in = avail;
offset = 0;
}
@@ -91,37 +105,28 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
if (!zlib_init) {
zlib_err = zlib_inflateInit(stream);
if (zlib_err != Z_OK) {
squashfs_finish_page(output);
goto out;
error = -EIO;
break;
}
zlib_init = 1;
}
zlib_err = zlib_inflate(stream, Z_SYNC_FLUSH);
if (stream->avail_in == 0 && k < b)
put_bh(bh[k++]);
} while (zlib_err == Z_OK);
if (zlib_err == Z_STREAM_END)
break;
if (zlib_err != Z_OK) {
error = -EIO;
break;
}
}
squashfs_finish_page(output);
if (zlib_err != Z_STREAM_END)
goto out;
if (!error)
if (zlib_inflateEnd(stream) != Z_OK)
error = -EIO;
zlib_err = zlib_inflateEnd(stream);
if (zlib_err != Z_OK)
goto out;
if (k < b)
goto out;
return stream->total_out;
out:
for (; k < b; k++)
put_bh(bh[k]);
return -EIO;
return error ? error : stream->total_out;
}
const struct squashfs_decompressor squashfs_zlib_comp_ops = {

View File

@@ -18,7 +18,7 @@
*/
#include <linux/mutex.h>
#include <linux/buffer_head.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/zstd.h>
#include <linux/vmalloc.h>
@@ -68,33 +68,44 @@ static void zstd_free(void *strm)
static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm,
struct buffer_head **bh, int b, int offset, int length,
struct bio *bio, int offset, int length,
struct squashfs_page_actor *output)
{
struct workspace *wksp = strm;
ZSTD_DStream *stream;
size_t total_out = 0;
size_t zstd_err;
int k = 0;
int error = 0;
ZSTD_inBuffer in_buf = { NULL, 0, 0 };
ZSTD_outBuffer out_buf = { NULL, 0, 0 };
struct bvec_iter_all iter_all = {};
struct bio_vec *bvec = bvec_init_iter_all(&iter_all);
stream = ZSTD_initDStream(wksp->window_size, wksp->mem, wksp->mem_size);
if (!stream) {
ERROR("Failed to initialize zstd decompressor\n");
goto out;
return -EIO;
}
out_buf.size = PAGE_SIZE;
out_buf.dst = squashfs_first_page(output);
do {
if (in_buf.pos == in_buf.size && k < b) {
int avail = min(length, msblk->devblksize - offset);
for (;;) {
size_t zstd_err;
if (in_buf.pos == in_buf.size) {
const void *data;
int avail;
if (!bio_next_segment(bio, &iter_all)) {
error = -EIO;
break;
}
avail = min(length, ((int)bvec->bv_len) - offset);
data = page_address(bvec->bv_page) + bvec->bv_offset;
length -= avail;
in_buf.src = bh[k]->b_data + offset;
in_buf.src = data + offset;
in_buf.size = avail;
in_buf.pos = 0;
offset = 0;
@@ -106,8 +117,8 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm,
/* Shouldn't run out of pages
* before stream is done.
*/
squashfs_finish_page(output);
goto out;
error = -EIO;
break;
}
out_buf.pos = 0;
out_buf.size = PAGE_SIZE;
@@ -116,29 +127,20 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm,
total_out -= out_buf.pos;
zstd_err = ZSTD_decompressStream(stream, &out_buf, &in_buf);
total_out += out_buf.pos; /* add the additional data produced */
if (zstd_err == 0)
break;
if (in_buf.pos == in_buf.size && k < b)
put_bh(bh[k++]);
} while (zstd_err != 0 && !ZSTD_isError(zstd_err));
if (ZSTD_isError(zstd_err)) {
ERROR("zstd decompression error: %d\n",
(int)ZSTD_getErrorCode(zstd_err));
error = -EIO;
break;
}
}
squashfs_finish_page(output);
if (ZSTD_isError(zstd_err)) {
ERROR("zstd decompression error: %d\n",
(int)ZSTD_getErrorCode(zstd_err));
goto out;
}
if (k < b)
goto out;
return (int)total_out;
out:
for (; k < b; k++)
put_bh(bh[k]);
return -EIO;
return error ? error : total_out;
}
const struct squashfs_decompressor squashfs_zstd_comp_ops = {

View File

@@ -410,6 +410,7 @@ struct blk_zone_report_hdr {
u8 padding[60];
};
extern unsigned int blkdev_nr_zones(struct block_device *bdev);
extern int blkdev_report_zones(struct block_device *bdev,
sector_t sector, struct blk_zone *zones,
unsigned int *nr_zones, gfp_t gfp_mask);
@@ -423,6 +424,10 @@ extern int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode,
#else /* CONFIG_BLK_DEV_ZONED */
static inline unsigned int blkdev_nr_zones(struct block_device *bdev)
{
return 0;
}
static inline int blkdev_report_zones_ioctl(struct block_device *bdev,
fmode_t mode, unsigned int cmd,
unsigned long arg)
@@ -818,6 +823,11 @@ static inline unsigned int blk_queue_zone_sectors(struct request_queue *q)
}
#ifdef CONFIG_BLK_DEV_ZONED
static inline unsigned int blk_queue_nr_zones(struct request_queue *q)
{
return blk_queue_is_zoned(q) ? q->nr_zones : 0;
}
static inline unsigned int blk_queue_zone_no(struct request_queue *q,
sector_t sector)
{
@@ -833,6 +843,11 @@ static inline bool blk_queue_zone_is_seq(struct request_queue *q,
return false;
return test_bit(blk_queue_zone_no(q, sector), q->seq_zones_bitmap);
}
#else /* CONFIG_BLK_DEV_ZONED */
static inline unsigned int blk_queue_nr_zones(struct request_queue *q)
{
return 0;
}
#endif /* CONFIG_BLK_DEV_ZONED */
static inline bool rq_is_sync(struct request *rq)

View File

@@ -113,7 +113,7 @@ struct elevator_mq_ops {
void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool);
struct request *(*dispatch_request)(struct blk_mq_hw_ctx *);
bool (*has_work)(struct blk_mq_hw_ctx *);
void (*completed_request)(struct request *);
void (*completed_request)(struct request *, u64);
void (*started_request)(struct request *);
void (*requeue_request)(struct request *);
struct request *(*former_request)(struct request_queue *, struct request *);

View File

@@ -49,22 +49,24 @@
/* Exported common interfaces */
#ifdef CONFIG_PREEMPT_RCU
void call_rcu(struct rcu_head *head, rcu_callback_t func);
#else /* #ifdef CONFIG_PREEMPT_RCU */
#define call_rcu call_rcu_sched
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
void call_rcu_bh(struct rcu_head *head, rcu_callback_t func);
void call_rcu_sched(struct rcu_head *head, rcu_callback_t func);
#ifndef CONFIG_TINY_RCU
void synchronize_sched(void);
void call_rcu_sched(struct rcu_head *head, rcu_callback_t func);
#endif
void call_rcu(struct rcu_head *head, rcu_callback_t func);
void rcu_barrier_tasks(void);
void synchronize_rcu(void);
static inline void call_rcu_bh(struct rcu_head *head, rcu_callback_t func)
{
call_rcu(head, func);
}
#ifdef CONFIG_PREEMPT_RCU
void __rcu_read_lock(void);
void __rcu_read_unlock(void);
void synchronize_rcu(void);
/*
* Defined as a macro as it is a very low level header included from
@@ -86,11 +88,6 @@ static inline void __rcu_read_unlock(void)
preempt_enable();
}
static inline void synchronize_rcu(void)
{
synchronize_sched();
}
static inline int rcu_preempt_depth(void)
{
return 0;
@@ -101,9 +98,7 @@ static inline int rcu_preempt_depth(void)
/* Internal to kernel */
void rcu_init(void);
extern int rcu_scheduler_active __read_mostly;
void rcu_sched_qs(void);
void rcu_bh_qs(void);
void rcu_check_callbacks(int user);
void rcu_sched_clock_irq(int user);
void rcu_report_dead(unsigned int cpu);
void rcutree_migrate_callbacks(int cpu);
@@ -362,8 +357,7 @@ static inline void rcu_preempt_sleep_check(void) { }
* and rcu_assign_pointer(). Some of these could be folded into their
* callers, but they are left separate in order to ease introduction of
* multiple flavors of pointers to match the multiple flavors of RCU
* (e.g., __rcu_bh, * __rcu_sched, and __srcu), should this make sense in
* the future.
* (e.g., __rcu_sched, and __srcu), should this make sense in the future.
*/
#ifdef __CHECKER__

View File

@@ -36,6 +36,11 @@ static inline int rcu_dynticks_snap(struct rcu_dynticks *rdtp)
/* Never flag non-existent other CPUs! */
static inline bool rcu_eqs_special_set(int cpu) { return false; }
static inline void synchronize_sched(void)
{
synchronize_rcu();
}
static inline unsigned long get_state_synchronize_rcu(void)
{
return 0;
@@ -56,17 +61,16 @@ static inline void cond_synchronize_sched(unsigned long oldstate)
might_sleep();
}
extern void rcu_barrier_bh(void);
extern void rcu_barrier_sched(void);
extern void rcu_barrier(void);
static inline void synchronize_rcu_expedited(void)
static inline void rcu_barrier_sched(void)
{
synchronize_sched(); /* Only one CPU, so pretty fast anyway!!! */
rcu_barrier(); /* Only one CPU, so only one list of callbacks! */
}
static inline void rcu_barrier(void)
static inline void rcu_barrier_bh(void)
{
rcu_barrier_sched(); /* Only one CPU, so only one list of callbacks! */
rcu_barrier();
}
static inline void synchronize_rcu_bh(void)
@@ -79,25 +83,36 @@ static inline void synchronize_rcu_bh_expedited(void)
synchronize_sched();
}
static inline void synchronize_rcu_expedited(void)
{
synchronize_sched();
}
static inline void synchronize_sched_expedited(void)
{
synchronize_sched();
}
static inline void kfree_call_rcu(struct rcu_head *head,
rcu_callback_t func)
static inline void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
{
call_rcu(head, func);
}
static inline void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
{
call_rcu(head, func);
}
void rcu_qs(void);
static inline void rcu_softirq_qs(void)
{
rcu_sched_qs();
rcu_qs();
}
#define rcu_note_context_switch(preempt) \
do { \
rcu_sched_qs(); \
rcu_qs(); \
rcu_tasks_qs(current); \
} while (0)

View File

@@ -45,10 +45,17 @@ static inline void rcu_virt_note_context_switch(int cpu)
rcu_note_context_switch(false);
}
void synchronize_rcu_bh(void);
void synchronize_sched_expedited(void);
static inline void synchronize_rcu_bh(void)
{
synchronize_rcu();
}
void synchronize_rcu_expedited(void);
static inline void synchronize_sched_expedited(void)
{
synchronize_rcu_expedited();
}
void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func);
/**
@@ -69,7 +76,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func);
*/
static inline void synchronize_rcu_bh_expedited(void)
{
synchronize_sched_expedited();
synchronize_rcu_expedited();
}
void rcu_barrier(void);

View File

@@ -36,6 +36,7 @@ struct srcu_struct {
struct rcu_head *srcu_cb_head; /* Pending callbacks: Head. */
struct rcu_head **srcu_cb_tail; /* Pending callbacks: Tail. */
struct work_struct srcu_work; /* For driving grace periods. */
struct list_head srcu_boot_entry; /* Early-boot callbacks. */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
@@ -48,6 +49,7 @@ void srcu_drive_gp(struct work_struct *wp);
.srcu_wq = __SWAIT_QUEUE_HEAD_INITIALIZER(name.srcu_wq), \
.srcu_cb_tail = &name.srcu_cb_head, \
.srcu_work = __WORK_INITIALIZER(name.srcu_work, srcu_drive_gp), \
.srcu_boot_entry = LIST_HEAD_INIT(name.srcu_boot_entry), \
__SRCU_DEP_MAP_INIT(name) \
}

View File

@@ -94,6 +94,7 @@ struct srcu_struct {
/* callback for the barrier */
/* operation. */
struct delayed_work work;
struct list_head srcu_boot_entry; /* Early-boot callbacks. */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
@@ -105,12 +106,13 @@ struct srcu_struct {
#define SRCU_STATE_SCAN2 2
#define __SRCU_STRUCT_INIT(name, pcpu_name) \
{ \
.sda = &pcpu_name, \
.lock = __SPIN_LOCK_UNLOCKED(name.lock), \
.srcu_gp_seq_needed = 0 - 1, \
__SRCU_DEP_MAP_INIT(name) \
}
{ \
.sda = &pcpu_name, \
.lock = __SPIN_LOCK_UNLOCKED(name.lock), \
.srcu_gp_seq_needed = -1UL, \
.srcu_boot_entry = LIST_HEAD_INIT(name.srcu_boot_entry), \
__SRCU_DEP_MAP_INIT(name) \
}
/*
* Define and initialize a srcu struct at build time.

View File

@@ -0,0 +1,96 @@
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kyber
#if !defined(_TRACE_KYBER_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_KYBER_H
#include <linux/blkdev.h>
#include <linux/tracepoint.h>
#define DOMAIN_LEN 16
#define LATENCY_TYPE_LEN 8
TRACE_EVENT(kyber_latency,
TP_PROTO(struct request_queue *q, const char *domain, const char *type,
unsigned int percentile, unsigned int numerator,
unsigned int denominator, unsigned int samples),
TP_ARGS(q, domain, type, percentile, numerator, denominator, samples),
TP_STRUCT__entry(
__field( dev_t, dev )
__array( char, domain, DOMAIN_LEN )
__array( char, type, LATENCY_TYPE_LEN )
__field( u8, percentile )
__field( u8, numerator )
__field( u8, denominator )
__field( unsigned int, samples )
),
TP_fast_assign(
__entry->dev = disk_devt(dev_to_disk(kobj_to_dev(q->kobj.parent)));
strlcpy(__entry->domain, domain, DOMAIN_LEN);
strlcpy(__entry->type, type, LATENCY_TYPE_LEN);
__entry->percentile = percentile;
__entry->numerator = numerator;
__entry->denominator = denominator;
__entry->samples = samples;
),
TP_printk("%d,%d %s %s p%u %u/%u samples=%u",
MAJOR(__entry->dev), MINOR(__entry->dev), __entry->domain,
__entry->type, __entry->percentile, __entry->numerator,
__entry->denominator, __entry->samples)
);
TRACE_EVENT(kyber_adjust,
TP_PROTO(struct request_queue *q, const char *domain,
unsigned int depth),
TP_ARGS(q, domain, depth),
TP_STRUCT__entry(
__field( dev_t, dev )
__array( char, domain, DOMAIN_LEN )
__field( unsigned int, depth )
),
TP_fast_assign(
__entry->dev = disk_devt(dev_to_disk(kobj_to_dev(q->kobj.parent)));
strlcpy(__entry->domain, domain, DOMAIN_LEN);
__entry->depth = depth;
),
TP_printk("%d,%d %s %u",
MAJOR(__entry->dev), MINOR(__entry->dev), __entry->domain,
__entry->depth)
);
TRACE_EVENT(kyber_throttled,
TP_PROTO(struct request_queue *q, const char *domain),
TP_ARGS(q, domain),
TP_STRUCT__entry(
__field( dev_t, dev )
__array( char, domain, DOMAIN_LEN )
),
TP_fast_assign(
__entry->dev = disk_devt(dev_to_disk(kobj_to_dev(q->kobj.parent)));
strlcpy(__entry->domain, domain, DOMAIN_LEN);
),
TP_printk("%d,%d %s", MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->domain)
);
#endif /* _TRACE_KYBER_H */
/* This part must be outside protection */
#include <trace/define_trace.h>

View File

@@ -137,8 +137,11 @@ struct blk_zone_range {
* sector specified in the report request structure.
* @BLKRESETZONE: Reset the write pointer of the zones in the specified
* sector range. The sector range must be zone aligned.
* @BLKGETZONESZ: Get the device zone size in number of 512 B sectors.
* @BLKGETNRZONES: Get the total number of zones of the device.
*/
#define BLKREPORTZONE _IOWR(0x12, 130, struct blk_zone_report)
#define BLKRESETZONE _IOW(0x12, 131, struct blk_zone_range)
#define BLKGETZONESZ _IOR(0x12, 132, __u32)
#define BLKGETNRZONES _IOR(0x12, 133, __u32)
#endif /* _UAPI_BLKZONED_H */
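The direction bits baked into an ioctl number declare which way the payload travels, and `BLKGETZONESZ`/`BLKGETNRZONES` return a `__u32` to userspace, so read-style encoding is what callers expect. A Linux-only userspace check that decodes those fields with glibc's `_IOC` helpers (the helper names here are mine, not a kernel API):

```c
#include <assert.h>
#include <sys/ioctl.h>

/* Decode the direction and payload-size fields of an ioctl request. */
static unsigned int dir_of(unsigned long req)  { return _IOC_DIR(req); }
static unsigned int size_of(unsigned long req) { return _IOC_SIZE(req); }
```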

View File

@@ -816,7 +816,6 @@ config MEMCG_SWAP
config MEMCG_SWAP_ENABLED
bool "Swap controller enabled by default"
depends on MEMCG_SWAP
default y
help
Memory Resource Controller Swap Extension comes with its price in
a bigger memory consumption. General purpose distribution kernels
@@ -830,7 +829,6 @@ config MEMCG_SWAP_ENABLED
config MEMCG_KMEM
bool
depends on MEMCG && !SLOB
default y
config BLK_CGROUP
bool "IO controller"
@@ -1885,7 +1883,6 @@ config SLAB_FREELIST_HARDENED
freelist exploit methods.
config SLUB_CPU_PARTIAL
default y
depends on SLUB && SMP
bool "SLUB per cpu partial cache"
help

View File

@@ -148,7 +148,7 @@ static ssize_t write_irq_affinity(int type, struct file *file,
if (!irq_can_set_affinity_usr(irq) || no_irq_affinity)
return -EIO;
if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
return -ENOMEM;
if (type)
@@ -247,7 +247,7 @@ static ssize_t default_affinity_write(struct file *file,
cpumask_var_t new_value;
int err;
if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
return -ENOMEM;
err = cpumask_parse_user(buffer, count, new_value);

View File

@@ -17,7 +17,6 @@
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <asm/processor.h>
@@ -163,9 +162,7 @@ static void irq_work_run_list(struct llist_head *list)
flags = atomic_read(&work->flags) & ~IRQ_WORK_PENDING;
atomic_xchg(&work->flags, flags);
check_start_time(ts);
work->func(work);
check_process_time("irq_work %ps", ts, work->func);
/*
* Clear the BUSY bit and return to the free state if
* no-one else claimed it meanwhile.

View File

@@ -343,7 +343,7 @@ static int profile_dead_cpu(unsigned int cpu)
struct page *page;
int i;
if (prof_cpu_mask != NULL)
if (cpumask_available(prof_cpu_mask))
cpumask_clear_cpu(cpu, prof_cpu_mask);
for (i = 0; i < 2; i++) {
@@ -380,7 +380,7 @@ static int profile_prepare_cpu(unsigned int cpu)
static int profile_online_cpu(unsigned int cpu)
{
if (prof_cpu_mask != NULL)
if (cpumask_available(prof_cpu_mask))
cpumask_set_cpu(cpu, prof_cpu_mask);
return 0;
@@ -410,7 +410,7 @@ void profile_tick(int type)
{
struct pt_regs *regs = get_irq_regs();
if (!user_mode(regs) && prof_cpu_mask != NULL &&
if (!user_mode(regs) && cpumask_available(prof_cpu_mask) &&
cpumask_test_cpu(smp_processor_id(), prof_cpu_mask))
profile_hit(type, (void *)profile_pc(regs));
}
@@ -437,7 +437,7 @@ static ssize_t prof_cpu_mask_proc_write(struct file *file,
cpumask_var_t new_value;
int err;
if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
return -ENOMEM;
err = cpumask_parse_user(buffer, count, new_value);

View File

@@ -443,6 +443,12 @@ do { \
#endif /* #if defined(SRCU) || !defined(TINY_RCU) */
#ifdef CONFIG_SRCU
void srcu_init(void);
#else /* #ifdef CONFIG_SRCU */
static inline void srcu_init(void) { }
#endif /* #else #ifdef CONFIG_SRCU */
#ifdef CONFIG_TINY_RCU
/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
static inline bool rcu_gp_is_normal(void) { return true; }

View File

@@ -34,6 +34,8 @@
#include "rcu.h"
int rcu_scheduler_active __read_mostly;
static LIST_HEAD(srcu_boot_list);
static bool srcu_init_done;
static int init_srcu_struct_fields(struct srcu_struct *sp)
{
@@ -46,6 +48,7 @@ static int init_srcu_struct_fields(struct srcu_struct *sp)
sp->srcu_gp_waiting = false;
sp->srcu_idx = 0;
INIT_WORK(&sp->srcu_work, srcu_drive_gp);
INIT_LIST_HEAD(&sp->srcu_boot_entry);
return 0;
}
@@ -179,8 +182,12 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
*sp->srcu_cb_tail = rhp;
sp->srcu_cb_tail = &rhp->next;
local_irq_restore(flags);
if (!READ_ONCE(sp->srcu_gp_running))
schedule_work(&sp->srcu_work);
if (!READ_ONCE(sp->srcu_gp_running)) {
if (likely(srcu_init_done))
schedule_work(&sp->srcu_work);
else if (list_empty(&sp->srcu_boot_entry))
list_add(&sp->srcu_boot_entry, &srcu_boot_list);
}
}
EXPORT_SYMBOL_GPL(call_srcu);
@@ -204,3 +211,21 @@ void __init rcu_scheduler_starting(void)
{
rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
}
/*
* Queue work for srcu_struct structures with early boot callbacks.
* The work won't actually execute until the workqueue initialization
* phase that takes place after the scheduler starts.
*/
void __init srcu_init(void)
{
struct srcu_struct *sp;
srcu_init_done = true;
while (!list_empty(&srcu_boot_list)) {
sp = list_first_entry(&srcu_boot_list,
struct srcu_struct, srcu_boot_entry);
list_del_init(&sp->srcu_boot_entry);
schedule_work(&sp->srcu_work);
}
}
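The `srcu_boot_list` scheme above is a general deferred-init pattern: callers that arrive before the work machinery exists park themselves on a list, and `srcu_init()` flips a flag and drains the list, after which submissions run immediately. A compact sketch with a counter standing in for `schedule_work()` (names are illustrative):

```c
#include <assert.h>
#include <stddef.h>

struct pending { struct pending *next; int ran; };

static struct pending *boot_list;
static int init_done;

static void submit(struct pending *p)
{
	if (init_done) {
		p->ran = 1;		/* schedule_work() stand-in */
	} else {
		p->next = boot_list;	/* too early: park on boot list */
		boot_list = p;
	}
}

static void late_init(void)
{
	init_done = 1;
	while (boot_list) {		/* drain everything queued early */
		struct pending *p = boot_list;

		boot_list = p->next;
		p->ran = 1;
	}
}
```

The real code additionally checks `list_empty(&sp->srcu_boot_entry)` so one `srcu_struct` is never queued twice; the sketch omits that guard.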

View File

@@ -51,6 +51,10 @@ module_param(exp_holdoff, ulong, 0444);
static ulong counter_wrap_check = (ULONG_MAX >> 2);
module_param(counter_wrap_check, ulong, 0444);
/* Early-boot callback-management, so early that no lock is required! */
static LIST_HEAD(srcu_boot_list);
static bool __read_mostly srcu_init_done;
static void srcu_invoke_callbacks(struct work_struct *work);
static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay);
static void process_srcu(struct work_struct *work);
@@ -182,6 +186,7 @@ static int init_srcu_struct_fields(struct srcu_struct *sp, bool is_static)
mutex_init(&sp->srcu_barrier_mutex);
atomic_set(&sp->srcu_barrier_cpu_cnt, 0);
INIT_DELAYED_WORK(&sp->work, process_srcu);
INIT_LIST_HEAD(&sp->srcu_boot_entry);
if (!is_static)
sp->sda = alloc_percpu(struct srcu_data);
init_srcu_struct_nodes(sp, is_static);
@@ -235,7 +240,6 @@ static void check_init_srcu_struct(struct srcu_struct *sp)
{
unsigned long flags;
WARN_ON_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INIT);
/* The smp_load_acquire() pairs with the smp_store_release(). */
if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/
return; /* Already initialized. */
@@ -703,7 +707,11 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) {
WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
srcu_gp_start(sp);
queue_delayed_work(rcu_gp_wq, &sp->work, srcu_get_delay(sp));
if (likely(srcu_init_done))
queue_delayed_work(rcu_gp_wq, &sp->work,
srcu_get_delay(sp));
else if (list_empty(&sp->srcu_boot_entry))
list_add(&sp->srcu_boot_entry, &srcu_boot_list);
}
spin_unlock_irqrestore_rcu_node(sp, flags);
}
@@ -1310,3 +1318,17 @@ static int __init srcu_bootup_announce(void)
return 0;
}
early_initcall(srcu_bootup_announce);
void __init srcu_init(void)
{
struct srcu_struct *sp;
srcu_init_done = true;
while (!list_empty(&srcu_boot_list)) {
sp = list_first_entry(&srcu_boot_list,
struct srcu_struct, srcu_boot_entry);
check_init_srcu_struct(sp);
list_del_init(&sp->srcu_boot_entry);
queue_work(rcu_gp_wq, &sp->work.work);
}
}

View File

@@ -1,23 +1,10 @@
// SPDX-License-Identifier: GPL-2.0+
/*
* Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, you can access it online at
* http://www.gnu.org/licenses/gpl-2.0.html.
*
* Copyright IBM Corporation, 2008
*
* Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
* Author: Paul E. McKenney <paulmck@linux.ibm.com>
*
* For detailed explanation of Read-Copy Update mechanism see -
* Documentation/RCU
@@ -46,69 +33,28 @@ struct rcu_ctrlblk {
};
/* Definition for rcupdate control block. */
static struct rcu_ctrlblk rcu_sched_ctrlblk = {
.donetail = &rcu_sched_ctrlblk.rcucblist,
.curtail = &rcu_sched_ctrlblk.rcucblist,
static struct rcu_ctrlblk rcu_ctrlblk = {
.donetail = &rcu_ctrlblk.rcucblist,
.curtail = &rcu_ctrlblk.rcucblist,
};
static struct rcu_ctrlblk rcu_bh_ctrlblk = {
.donetail = &rcu_bh_ctrlblk.rcucblist,
.curtail = &rcu_bh_ctrlblk.rcucblist,
};
void rcu_barrier_bh(void)
void rcu_barrier(void)
{
wait_rcu_gp(call_rcu_bh);
wait_rcu_gp(call_rcu);
}
EXPORT_SYMBOL(rcu_barrier_bh);
EXPORT_SYMBOL(rcu_barrier);
void rcu_barrier_sched(void)
/* Record an rcu quiescent state. */
void rcu_qs(void)
{
wait_rcu_gp(call_rcu_sched);
}
EXPORT_SYMBOL(rcu_barrier_sched);
unsigned long flags;
/*
* Helper function for rcu_sched_qs() and rcu_bh_qs().
* Also irqs are disabled to avoid confusion due to interrupt handlers
* invoking call_rcu().
*/
static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
{
if (rcp->donetail != rcp->curtail) {
rcp->donetail = rcp->curtail;
return 1;
local_irq_save(flags);
if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) {
rcu_ctrlblk.donetail = rcu_ctrlblk.curtail;
raise_softirq_irqoff(RCU_SOFTIRQ);
}
return 0;
}
/*
* Record an rcu quiescent state. And an rcu_bh quiescent state while we
* are at it, given that any rcu quiescent state is also an rcu_bh
* quiescent state. Use "+" instead of "||" to defeat short circuiting.
*/
void rcu_sched_qs(void)
{
unsigned long flags;
local_irq_save(flags);
if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
rcu_qsctr_help(&rcu_bh_ctrlblk))
raise_softirq(RCU_SOFTIRQ);
local_irq_restore(flags);
}
/*
* Record an rcu_bh quiescent state.
*/
void rcu_bh_qs(void)
{
unsigned long flags;
local_irq_save(flags);
if (rcu_qsctr_help(&rcu_bh_ctrlblk))
raise_softirq(RCU_SOFTIRQ);
local_irq_restore(flags);
}
@@ -118,36 +64,35 @@ void rcu_bh_qs(void)
* be called from hardirq context. It is normally called from the
* scheduling-clock interrupt.
*/
void rcu_check_callbacks(int user)
void rcu_sched_clock_irq(int user)
{
if (user)
rcu_sched_qs();
if (user || !in_softirq())
rcu_bh_qs();
if (user) {
rcu_qs();
} else if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) {
set_tsk_need_resched(current);
set_preempt_need_resched();
}
}
/*
* Invoke the RCU callbacks on the specified rcu_ctrlblk structure
* whose grace period has elapsed.
*/
static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
/* Invoke the RCU callbacks whose grace period has elapsed. */
static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
{
struct rcu_head *next, *list;
unsigned long flags;
/* Move the ready-to-invoke callbacks to a local list. */
local_irq_save(flags);
if (rcp->donetail == &rcp->rcucblist) {
if (rcu_ctrlblk.donetail == &rcu_ctrlblk.rcucblist) {
/* No callbacks ready, so just leave. */
local_irq_restore(flags);
return;
}
list = rcp->rcucblist;
rcp->rcucblist = *rcp->donetail;
*rcp->donetail = NULL;
if (rcp->curtail == rcp->donetail)
rcp->curtail = &rcp->rcucblist;
rcp->donetail = &rcp->rcucblist;
list = rcu_ctrlblk.rcucblist;
rcu_ctrlblk.rcucblist = *rcu_ctrlblk.donetail;
*rcu_ctrlblk.donetail = NULL;
if (rcu_ctrlblk.curtail == rcu_ctrlblk.donetail)
rcu_ctrlblk.curtail = &rcu_ctrlblk.rcucblist;
rcu_ctrlblk.donetail = &rcu_ctrlblk.rcucblist;
local_irq_restore(flags);
/* Invoke the callbacks on the local list. */
@@ -162,37 +107,31 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
}
}
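The donetail/curtail tail-pointer scheme these hunks rewrite can be exercised in ordinary userspace C. The sketch below is a simplified, single-threaded model of Tiny RCU's callback list (no interrupts or softirq; the field names are borrowed from tiny.c, but nothing here is kernel code): enqueueing appends at *curtail, a quiescent state advances donetail to curtail, and processing splices off everything up to donetail and invokes it.

```c
#include <assert.h>
#include <stddef.h>

struct rcu_head {
	struct rcu_head *next;
	void (*func)(struct rcu_head *);
};

/* Simplified model of struct rcu_ctrlblk from kernel/rcu/tiny.c. */
struct ctrlblk {
	struct rcu_head *rcucblist;	/* List of pending callbacks. */
	struct rcu_head **donetail;	/* ->next of last ready-to-invoke CB. */
	struct rcu_head **curtail;	/* ->next of last queued CB. */
};

static void cb_init(struct ctrlblk *cb)
{
	cb->rcucblist = NULL;
	cb->donetail = &cb->rcucblist;
	cb->curtail = &cb->rcucblist;
}

/* Model of call_rcu(): append at the current tail. */
static void cb_enqueue(struct ctrlblk *cb, struct rcu_head *head,
		       void (*func)(struct rcu_head *))
{
	head->func = func;
	head->next = NULL;
	*cb->curtail = head;
	cb->curtail = &head->next;
}

/* Model of rcu_qs(): a quiescent state makes all queued CBs ready. */
static void cb_qs(struct ctrlblk *cb)
{
	cb->donetail = cb->curtail;
}

/*
 * Model of rcu_process_callbacks(): splice the ready callbacks onto a
 * local list, reset the tail pointers, then invoke the local list.
 * Returns the number of callbacks invoked.
 */
static int cb_process(struct ctrlblk *cb)
{
	struct rcu_head *next, *list;
	int n = 0;

	if (cb->donetail == &cb->rcucblist)
		return 0;		/* No callbacks ready. */
	list = cb->rcucblist;
	cb->rcucblist = *cb->donetail;
	*cb->donetail = NULL;
	if (cb->curtail == cb->donetail)
		cb->curtail = &cb->rcucblist;
	cb->donetail = &cb->rcucblist;
	while (list) {
		next = list->next;
		list->func(list);
		list = next;
		n++;
	}
	return n;
}
```

Note how a callback queued after the quiescent state stays on the pending side of donetail and is not invoked until the next quiescent state, which is exactly the grace-period guarantee on a single CPU.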
static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
{
__rcu_process_callbacks(&rcu_sched_ctrlblk);
__rcu_process_callbacks(&rcu_bh_ctrlblk);
}
/*
* Wait for a grace period to elapse. But it is illegal to invoke
* synchronize_sched() from within an RCU read-side critical section.
* Therefore, any legal call to synchronize_sched() is a quiescent
* state, and so on a UP system, synchronize_sched() need do nothing.
* Ditto for synchronize_rcu_bh(). (But Lai Jiangshan points out the
* benefits of doing might_sleep() to reduce latency.)
* synchronize_rcu() from within an RCU read-side critical section.
* Therefore, any legal call to synchronize_rcu() is a quiescent
* state, and so on a UP system, synchronize_rcu() need do nothing.
* (But Lai Jiangshan points out the benefits of doing might_sleep()
* to reduce latency.)
*
* Cool, huh? (Due to Josh Triplett.)
*/
void synchronize_sched(void)
void synchronize_rcu(void)
{
RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_sched() in RCU read-side critical section");
"Illegal synchronize_rcu() in RCU read-side critical section");
}
EXPORT_SYMBOL_GPL(synchronize_sched);
EXPORT_SYMBOL_GPL(synchronize_rcu);
/*
* Helper function for call_rcu() and call_rcu_bh().
* Post an RCU callback to be invoked after the end of an RCU grace
* period. But since we have but one CPU, that would be after any
* quiescent state.
*/
static void __call_rcu(struct rcu_head *head,
rcu_callback_t func,
struct rcu_ctrlblk *rcp)
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
unsigned long flags;
@@ -201,39 +140,20 @@ static void __call_rcu(struct rcu_head *head,
head->next = NULL;
local_irq_save(flags);
*rcp->curtail = head;
rcp->curtail = &head->next;
*rcu_ctrlblk.curtail = head;
rcu_ctrlblk.curtail = &head->next;
local_irq_restore(flags);
if (unlikely(is_idle_task(current))) {
/* force scheduling for rcu_sched_qs() */
/* force scheduling for rcu_qs() */
resched_cpu(0);
}
}
/*
* Post an RCU callback to be invoked after the end of an RCU-sched grace
* period. But since we have but one CPU, that would be after any
* quiescent state.
*/
void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
{
__call_rcu(head, func, &rcu_sched_ctrlblk);
}
EXPORT_SYMBOL_GPL(call_rcu_sched);
/*
* Post an RCU bottom-half callback to be invoked after any subsequent
* quiescent state.
*/
void call_rcu_bh(struct rcu_head *head, rcu_callback_t func)
{
__call_rcu(head, func, &rcu_bh_ctrlblk);
}
EXPORT_SYMBOL_GPL(call_rcu_bh);
EXPORT_SYMBOL_GPL(call_rcu);
void __init rcu_init(void)
{
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
rcu_early_boot_tests();
srcu_init();
}


@@ -92,25 +92,29 @@ static const char *tp_##sname##_varname __used __tracepoint_string = sname##_var
#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \
DEFINE_RCU_TPS(sname) \
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data); \
struct rcu_state sname##_state = { \
.level = { &sname##_state.node[0] }, \
.rda = &sname##_data, \
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data); \
struct rcu_state rcu_state = { \
.level = { &rcu_state.node[0] }, \
.rda = &rcu_data, \
.call = cr, \
.gp_state = RCU_GP_IDLE, \
.gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT, \
.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
.barrier_mutex = __MUTEX_INITIALIZER(rcu_state.barrier_mutex), \
.name = RCU_STATE_NAME(sname), \
.abbr = sabbr, \
.exp_mutex = __MUTEX_INITIALIZER(sname##_state.exp_mutex), \
.exp_wake_mutex = __MUTEX_INITIALIZER(sname##_state.exp_wake_mutex), \
.ofl_lock = __SPIN_LOCK_UNLOCKED(sname##_state.ofl_lock), \
.exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex), \
.exp_wake_mutex = __MUTEX_INITIALIZER(rcu_state.exp_wake_mutex), \
.ofl_lock = __SPIN_LOCK_UNLOCKED(rcu_state.ofl_lock), \
}
RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
#ifdef CONFIG_PREEMPT_RCU
RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
#else
RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu);
#endif
static struct rcu_state *const rcu_state_p;
static struct rcu_state *const rcu_state_p = &rcu_state;
static struct rcu_data __percpu *const rcu_data_p = &rcu_data;
LIST_HEAD(rcu_struct_flavors);
/* Dump rcu_node combining tree at boot to verify correct setup. */
@@ -222,43 +226,9 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
return rcu_seq_state(rcu_seq_current(&rsp->gp_seq));
}
/*
* Note a quiescent state. Because we do not need to know
* how many quiescent states passed, just if there was at least
* one since the start of the grace period, this just sets a flag.
* The caller must have disabled preemption.
*/
void rcu_sched_qs(void)
{
RCU_LOCKDEP_WARN(preemptible(), "rcu_sched_qs() invoked with preemption enabled!!!");
if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s))
return;
trace_rcu_grace_period(TPS("rcu_sched"),
__this_cpu_read(rcu_sched_data.gp_seq),
TPS("cpuqs"));
__this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false);
if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
return;
__this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false);
rcu_report_exp_rdp(&rcu_sched_state,
this_cpu_ptr(&rcu_sched_data), true);
}
void rcu_bh_qs(void)
{
RCU_LOCKDEP_WARN(preemptible(), "rcu_bh_qs() invoked with preemption enabled!!!");
if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) {
trace_rcu_grace_period(TPS("rcu_bh"),
__this_cpu_read(rcu_bh_data.gp_seq),
TPS("cpuqs"));
__this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false);
}
}
void rcu_softirq_qs(void)
{
rcu_sched_qs();
rcu_preempt_qs();
rcu_qs();
rcu_preempt_deferred_qs(current);
}
@@ -432,31 +402,18 @@ static void rcu_momentary_dyntick_idle(void)
rcu_preempt_deferred_qs(current);
}
/*
* Note a context switch. This is a quiescent state for RCU-sched,
* and requires special handling for preemptible RCU.
* The caller must have disabled interrupts.
/**
* rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
*
* If the current CPU is idle or running at a first-level (not nested)
* interrupt from idle, return true. The caller must have at least
* disabled preemption.
*/
void rcu_note_context_switch(bool preempt)
static int rcu_is_cpu_rrupt_from_idle(void)
{
barrier(); /* Avoid RCU read-side critical sections leaking down. */
trace_rcu_utilization(TPS("Start context switch"));
rcu_sched_qs();
rcu_preempt_note_context_switch(preempt);
/* Load rcu_urgent_qs before other flags. */
if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs)))
goto out;
this_cpu_write(rcu_dynticks.rcu_urgent_qs, false);
if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs)))
rcu_momentary_dyntick_idle();
this_cpu_inc(rcu_dynticks.rcu_qs_ctr);
if (!preempt)
rcu_tasks_qs(current);
out:
trace_rcu_utilization(TPS("End context switch"));
barrier(); /* Avoid RCU read-side critical sections leaking up. */
return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 0 &&
__this_cpu_read(rcu_dynticks.dynticks_nmi_nesting) <= 1;
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
/*
* Register a quiescent state for all RCU flavors. If there is an
@@ -490,8 +447,8 @@ void rcu_all_qs(void)
rcu_momentary_dyntick_idle();
local_irq_restore(flags);
}
if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)))
rcu_sched_qs();
if (unlikely(raw_cpu_read(rcu_data.cpu_no_qs.b.exp)))
rcu_qs();
this_cpu_inc(rcu_dynticks.rcu_qs_ctr);
barrier(); /* Avoid RCU read-side critical sections leaking up. */
preempt_enable();
@@ -572,7 +529,7 @@ EXPORT_SYMBOL_GPL(rcu_get_gp_seq);
*/
unsigned long rcu_sched_get_gp_seq(void)
{
return READ_ONCE(rcu_sched_state.gp_seq);
return rcu_get_gp_seq();
}
EXPORT_SYMBOL_GPL(rcu_sched_get_gp_seq);
@@ -581,7 +538,7 @@ EXPORT_SYMBOL_GPL(rcu_sched_get_gp_seq);
*/
unsigned long rcu_bh_get_gp_seq(void)
{
return READ_ONCE(rcu_bh_state.gp_seq);
return READ_ONCE(rcu_state_p->gp_seq);
}
EXPORT_SYMBOL_GPL(rcu_bh_get_gp_seq);
@@ -603,7 +560,7 @@ EXPORT_SYMBOL_GPL(rcu_exp_batches_completed);
*/
unsigned long rcu_exp_batches_completed_sched(void)
{
return rcu_sched_state.expedited_sequence;
return rcu_state.expedited_sequence;
}
EXPORT_SYMBOL_GPL(rcu_exp_batches_completed_sched);
@@ -621,7 +578,7 @@ EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
*/
void rcu_bh_force_quiescent_state(void)
{
force_quiescent_state(&rcu_bh_state);
force_quiescent_state(rcu_state_p);
}
EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
@@ -630,7 +587,7 @@ EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
*/
void rcu_sched_force_quiescent_state(void)
{
force_quiescent_state(&rcu_sched_state);
rcu_force_quiescent_state();
}
EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
@@ -680,13 +637,9 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
switch (test_type) {
case RCU_FLAVOR:
rsp = rcu_state_p;
break;
case RCU_BH_FLAVOR:
rsp = &rcu_bh_state;
break;
case RCU_SCHED_FLAVOR:
rsp = &rcu_sched_state;
rsp = rcu_state_p;
break;
default:
break;
@@ -1098,19 +1051,6 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
/**
* rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
*
* If the current CPU is idle or running at a first-level (not nested)
* interrupt from idle, return true. The caller must have at least
* disabled preemption.
*/
static int rcu_is_cpu_rrupt_from_idle(void)
{
return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 0 &&
__this_cpu_read(rcu_dynticks.dynticks_nmi_nesting) <= 1;
}
/*
* We are reporting a quiescent state on behalf of some other CPU, so
* it is our responsibility to check for and handle potential overflow
@@ -2365,7 +2305,7 @@ rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
struct rcu_node *rnp_p;
raw_lockdep_assert_held_rcu_node(rnp);
if (WARN_ON_ONCE(rcu_state_p == &rcu_sched_state) ||
if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)) ||
WARN_ON_ONCE(rsp != rcu_state_p) ||
WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) ||
rnp->qsmask != 0) {
@@ -2640,47 +2580,18 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
}
/*
* Check to see if this CPU is in a non-context-switch quiescent state
* (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
* Also schedule RCU core processing.
*
* This function must be called from hardirq context. It is normally
* invoked from the scheduling-clock interrupt.
* This function is invoked from each scheduling-clock interrupt,
* and checks to see if this CPU is in a non-context-switch quiescent
* state, for example, user mode or idle loop. It also schedules RCU
* core processing. If the current grace period has gone on too long,
* it will ask the scheduler to manufacture a context switch for the sole
purpose of providing the needed quiescent state.
*/
void rcu_check_callbacks(int user)
void rcu_sched_clock_irq(int user)
{
trace_rcu_utilization(TPS("Start scheduler-tick"));
increment_cpu_stall_ticks();
if (user || rcu_is_cpu_rrupt_from_idle()) {
/*
* Get here if this CPU took its interrupt from user
* mode or from the idle loop, and if this is not a
* nested interrupt. In this case, the CPU is in
* a quiescent state, so note it.
*
* No memory barrier is required here because both
* rcu_sched_qs() and rcu_bh_qs() reference only CPU-local
* variables that other CPUs neither access nor modify,
* at least not while the corresponding CPU is online.
*/
rcu_sched_qs();
rcu_bh_qs();
rcu_note_voluntary_context_switch(current);
} else if (!in_softirq()) {
/*
* Get here if this CPU did not take its interrupt from
* softirq, in other words, if it is not interrupting
* a rcu_bh read-side critical section. This is an _bh
* critical section, so note it.
*/
rcu_bh_qs();
}
rcu_preempt_check_callbacks();
rcu_flavor_sched_clock_irq(user);
/* The load-acquire pairs with the store-release setting to true. */
if (smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) {
/* Idle and userspace execution already are quiescent states. */
@@ -2715,7 +2626,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp))
mask = 0;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rnp->qsmask == 0) {
if (rcu_state_p == &rcu_sched_state ||
if (!IS_ENABLED(CONFIG_PREEMPT) ||
rsp != rcu_state_p ||
rcu_preempt_blocked_readers_cgp(rnp)) {
/*
@@ -3048,60 +2959,60 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
local_irq_restore(flags);
}
/**
* call_rcu() - Queue an RCU callback for invocation after a grace period.
* @head: structure to be used for queueing the RCU updates.
* @func: actual callback function to be invoked after the grace period
*
* The callback function will be invoked some time after a full grace
* period elapses, in other words after all pre-existing RCU read-side
* critical sections have completed. However, the callback function
* might well execute concurrently with RCU read-side critical sections
* that started after call_rcu() was invoked. RCU read-side critical
* sections are delimited by rcu_read_lock() and rcu_read_unlock(), and
* may be nested. In addition, regions of code across which interrupts,
* preemption, or softirqs have been disabled also serve as RCU read-side
* critical sections. This includes hardware interrupt handlers, softirq
* handlers, and NMI handlers.
*
* Note that all CPUs must agree that the grace period extended beyond
* all pre-existing RCU read-side critical sections. On systems with more
* than one CPU, this means that when "func()" is invoked, each CPU is
* guaranteed to have executed a full memory barrier since the end of its
* last RCU read-side critical section whose beginning preceded the call
* to call_rcu(). It also means that each CPU executing an RCU read-side
* critical section that continues beyond the start of "func()" must have
* executed a memory barrier after the call_rcu() but before the beginning
* of that RCU read-side critical section. Note that these guarantees
* include CPUs that are offline, idle, or executing in user mode, as
* well as CPUs that are executing in the kernel.
*
* Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
* resulting RCU callback function "func()", then both CPU A and CPU B are
* guaranteed to execute a full memory barrier during the time interval
* between the call to call_rcu() and the invocation of "func()" -- even
* if CPU A and CPU B are the same CPU (but again only if the system has
* more than one CPU).
*/
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
__call_rcu(head, func, rcu_state_p, -1, 0);
}
EXPORT_SYMBOL_GPL(call_rcu);
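The kernel-doc above describes when the callback runs; the usual usage pattern embeds the rcu_head in the protected structure and recovers the enclosing object with container_of() in the callback. Below is a hedged userspace sketch of that idiom; the mock call_rcu() here invokes the callback immediately, since there is no real grace period to wait for, and `struct foo`/`foo_delete()` are illustrative names, not kernel API.

```c
#include <assert.h>
#include <stddef.h>
#include <stdlib.h>

struct rcu_head {
	struct rcu_head *next;
	void (*func)(struct rcu_head *);
};

/* The usual kernel idiom: embed the rcu_head in the protected object. */
struct foo {
	int data;
	struct rcu_head rcu;
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static int frees;

/* Callback: recover the enclosing object and free it. */
static void foo_reclaim(struct rcu_head *rp)
{
	struct foo *fp = container_of(rp, struct foo, rcu);

	frees++;
	free(fp);
}

/*
 * Mock call_rcu(): invokes the callback at once. In the kernel, the
 * invocation is deferred until a full grace period has elapsed, so
 * concurrent readers can still safely dereference the object.
 */
static void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *))
{
	func(head);
}

static void foo_delete(struct foo *fp)
{
	/* Updater: unlink fp from its data structure, then defer the free. */
	call_rcu(&fp->rcu, foo_reclaim);
}
```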
/**
* call_rcu_sched() - Queue an RCU for invocation after sched grace period.
* @head: structure to be used for queueing the RCU updates.
* @func: actual callback function to be invoked after the grace period
*
* The callback function will be invoked some time after a full grace
* period elapses, in other words after all currently executing RCU
* read-side critical sections have completed. call_rcu_sched() assumes
* that the read-side critical sections end on enabling of preemption
* or on voluntary preemption.
* RCU read-side critical sections are delimited by:
*
* - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR
* - anything that disables preemption.
*
* These may be nested.
*
* See the description of call_rcu() for more detailed information on
* memory ordering guarantees.
* This is transitional.
*/
void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
{
__call_rcu(head, func, &rcu_sched_state, -1, 0);
call_rcu(head, func);
}
EXPORT_SYMBOL_GPL(call_rcu_sched);
/**
* call_rcu_bh() - Queue an RCU for invocation after a quicker grace period.
* @head: structure to be used for queueing the RCU updates.
* @func: actual callback function to be invoked after the grace period
*
* The callback function will be invoked some time after a full grace
* period elapses, in other words after all currently executing RCU
* read-side critical sections have completed. call_rcu_bh() assumes
* that the read-side critical sections end on completion of a softirq
* handler. This means that read-side critical sections in process
* context must not be interrupted by softirqs. This interface is to be
* used when most of the read-side critical sections are in softirq context.
* RCU read-side critical sections are delimited by:
*
* - rcu_read_lock() and rcu_read_unlock(), if in interrupt context, OR
* - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context.
*
* These may be nested.
*
* See the description of call_rcu() for more detailed information on
* memory ordering guarantees.
*/
void call_rcu_bh(struct rcu_head *head, rcu_callback_t func)
{
__call_rcu(head, func, &rcu_bh_state, -1, 0);
}
EXPORT_SYMBOL_GPL(call_rcu_bh);
/*
* Queue an RCU callback for lazy invocation after a grace period.
* This will likely be later named something like "call_rcu_lazy()",
@@ -3116,103 +3027,17 @@ void kfree_call_rcu(struct rcu_head *head,
}
EXPORT_SYMBOL_GPL(kfree_call_rcu);
/*
* Because a context switch is a grace period for RCU-sched and RCU-bh,
* any blocking grace-period wait automatically implies a grace period
* if there is only one CPU online at any point in time during execution
* of either synchronize_sched() or synchronize_rcu_bh(). It is OK to
* occasionally incorrectly indicate that there are multiple CPUs online
* when there was in fact only one the whole time, as this just adds
* some overhead: RCU still operates correctly.
*/
static int rcu_blocking_is_gp(void)
{
int ret;
might_sleep(); /* Check for RCU read-side critical section. */
preempt_disable();
ret = num_online_cpus() <= 1;
preempt_enable();
return ret;
}
/**
* synchronize_sched - wait until an rcu-sched grace period has elapsed.
*
* Control will return to the caller some time after a full rcu-sched
* grace period has elapsed, in other words after all currently executing
* rcu-sched read-side critical sections have completed. These read-side
* critical sections are delimited by rcu_read_lock_sched() and
* rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(),
* local_irq_disable(), and so on may be used in place of
* rcu_read_lock_sched().
*
* This means that all preempt_disable code sequences, including NMI and
* non-threaded hardware-interrupt handlers, in progress on entry will
* have completed before this primitive returns. However, this does not
* guarantee that softirq handlers will have completed, since in some
* kernels, these handlers can run in process context, and can block.
*
* Note that this guarantee implies further memory-ordering guarantees.
* On systems with more than one CPU, when synchronize_sched() returns,
* each CPU is guaranteed to have executed a full memory barrier since the
* end of its last RCU-sched read-side critical section whose beginning
* preceded the call to synchronize_sched(). In addition, each CPU having
* an RCU read-side critical section that extends beyond the return from
* synchronize_sched() is guaranteed to have executed a full memory barrier
* after the beginning of synchronize_sched() and before the beginning of
* that RCU read-side critical section. Note that these guarantees include
* CPUs that are offline, idle, or executing in user mode, as well as CPUs
* that are executing in the kernel.
*
* Furthermore, if CPU A invoked synchronize_sched(), which returned
* to its caller on CPU B, then both CPU A and CPU B are guaranteed
* to have executed a full memory barrier during the execution of
* synchronize_sched() -- even if CPU A and CPU B are the same CPU (but
* again only if the system has more than one CPU).
* This is transitional.
*/
void synchronize_sched(void)
{
RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_sched() in RCU-sched read-side critical section");
if (rcu_blocking_is_gp())
return;
if (rcu_gp_is_expedited())
synchronize_sched_expedited();
else
wait_rcu_gp(call_rcu_sched);
synchronize_rcu();
}
EXPORT_SYMBOL_GPL(synchronize_sched);
/**
* synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
*
* Control will return to the caller some time after a full rcu_bh grace
* period has elapsed, in other words after all currently executing rcu_bh
* read-side critical sections have completed. RCU read-side critical
* sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
* and may be nested.
*
* See the description of synchronize_sched() for more detailed information
* on memory ordering guarantees.
*/
void synchronize_rcu_bh(void)
{
RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
if (rcu_blocking_is_gp())
return;
if (rcu_gp_is_expedited())
synchronize_rcu_bh_expedited();
else
wait_rcu_gp(call_rcu_bh);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
/**
* get_state_synchronize_rcu - Snapshot current RCU state
*
@@ -3257,41 +3082,23 @@ EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
/**
* get_state_synchronize_sched - Snapshot current RCU-sched state
*
* Returns a cookie that is used by a later call to cond_synchronize_sched()
* to determine whether or not a full grace period has elapsed in the
* meantime.
* This is transitional, and only used by rcutorture.
*/
unsigned long get_state_synchronize_sched(void)
{
/*
* Any prior manipulation of RCU-protected data must happen
* before the load from ->gp_seq.
*/
smp_mb(); /* ^^^ */
return rcu_seq_snap(&rcu_sched_state.gp_seq);
return get_state_synchronize_rcu();
}
EXPORT_SYMBOL_GPL(get_state_synchronize_sched);
/**
* cond_synchronize_sched - Conditionally wait for an RCU-sched grace period
*
* @oldstate: return value from earlier call to get_state_synchronize_sched()
*
* If a full RCU-sched grace period has elapsed since the earlier call to
* get_state_synchronize_sched(), just return. Otherwise, invoke
* synchronize_sched() to wait for a full grace period.
*
* Yes, this function does not take counter wrap into account. But
* counter wrap is harmless. If the counter wraps, we have waited for
* more than 2 billion grace periods (and way more on a 64-bit system!),
* so waiting for one additional grace period should be just fine.
* This is transitional and only used by rcutorture.
*/
void cond_synchronize_sched(unsigned long oldstate)
{
if (!rcu_seq_done(&rcu_sched_state.gp_seq, oldstate))
synchronize_sched();
else
smp_mb(); /* Ensure GP ends before subsequent accesses. */
cond_synchronize_rcu(oldstate);
}
EXPORT_SYMBOL_GPL(cond_synchronize_sched);
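The cookie that get_state_synchronize_sched() now forwards to get_state_synchronize_rcu() is plain sequence-counter arithmetic on ->gp_seq. The following userspace model mirrors the kernel's rcu_seq_snap()/rcu_seq_done() helpers, with the low RCU_SEQ_CTR_SHIFT bits holding grace-period state and a wrap-safe comparison; it is a simplified sketch, not kernel code.

```c
#include <assert.h>

/* Low-order bits of ->gp_seq hold grace-period state; the rest count GPs. */
#define RCU_SEQ_CTR_SHIFT	2
#define RCU_SEQ_STATE_MASK	((1UL << RCU_SEQ_CTR_SHIFT) - 1)

/* Wrap-safe "a is at or after b" comparison. */
static int ulong_cmp_ge(unsigned long a, unsigned long b)
{
	return (long)(a - b) >= 0;
}

/* Mark the start of a grace period: make the state bits nonzero. */
static void seq_start(unsigned long *sp)
{
	(*sp)++;
	assert(*sp & RCU_SEQ_STATE_MASK);
}

/* Mark the end: round up to the next counter value with zero state. */
static void seq_end(unsigned long *sp)
{
	*sp = (*sp | RCU_SEQ_STATE_MASK) + 1;
}

/*
 * Cookie for get_state_synchronize_rcu(): the smallest counter value
 * that can be reached only after a full grace period beginning after
 * this call, even if one is already in progress now.
 */
static unsigned long seq_snap(unsigned long *sp)
{
	return (*sp + 2 * RCU_SEQ_STATE_MASK + 1) & ~RCU_SEQ_STATE_MASK;
}

/* Has a full grace period elapsed since the cookie was taken? */
static int seq_done(unsigned long *sp, unsigned long s)
{
	return ulong_cmp_ge(*sp, s);
}
```

As the comment above notes, counter wrap is harmless: the wrap-safe comparison at worst makes a caller wait for one extra grace period.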
@@ -3524,16 +3331,32 @@ static void _rcu_barrier(struct rcu_state *rsp)
*/
void rcu_barrier_bh(void)
{
_rcu_barrier(&rcu_bh_state);
_rcu_barrier(rcu_state_p);
}
EXPORT_SYMBOL_GPL(rcu_barrier_bh);
/**
* rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
*
* Note that this primitive does not necessarily wait for an RCU grace period
* to complete. For example, if there are no RCU callbacks queued anywhere
* in the system, then rcu_barrier() is within its rights to return
* immediately, without waiting for anything, much less an RCU grace period.
*/
void rcu_barrier(void)
{
_rcu_barrier(rcu_state_p);
}
EXPORT_SYMBOL_GPL(rcu_barrier);
/**
* rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
*
* This is transitional.
*/
void rcu_barrier_sched(void)
{
_rcu_barrier(&rcu_sched_state);
rcu_barrier();
}
EXPORT_SYMBOL_GPL(rcu_barrier_sched);
@@ -3832,8 +3655,7 @@ void rcu_report_dead(unsigned int cpu)
/* QS for any half-done expedited RCU-sched GP. */
preempt_disable();
rcu_report_exp_rdp(&rcu_sched_state,
this_cpu_ptr(rcu_sched_state.rda), true);
rcu_report_exp_rdp(&rcu_state, this_cpu_ptr(rcu_state.rda), true);
preempt_enable();
rcu_preempt_deferred_qs(current);
for_each_rcu_flavor(rsp)
@@ -4169,17 +3991,15 @@ struct workqueue_struct *rcu_par_gp_wq;
void __init rcu_init(void)
{
int cpu;
int cpu = smp_processor_id();
rcu_early_boot_tests();
rcu_bootup_announce();
rcu_init_geometry();
rcu_init_one(&rcu_bh_state);
rcu_init_one(&rcu_sched_state);
rcu_init_one(&rcu_state);
if (dump_tree)
rcu_dump_rcu_node_tree(&rcu_sched_state);
__rcu_init_preempt();
rcu_dump_rcu_node_tree(&rcu_state);
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
/*
@@ -4188,17 +4008,17 @@ void __init rcu_init(void)
* or the scheduler are operational.
*/
pm_notifier(rcu_pm_notify, 0);
for_each_online_cpu(cpu) {
rcutree_prepare_cpu(cpu);
rcu_cpu_starting(cpu);
rcutree_online_cpu(cpu);
}
WARN_ON(num_online_cpus() > 1); // Only one CPU this early in boot.
rcutree_prepare_cpu(cpu);
rcu_cpu_starting(cpu);
rcutree_online_cpu(cpu);
/* Create workqueue for expedited GPs and for Tree SRCU. */
rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_POWER_EFFICIENT | WQ_MEM_RECLAIM, 0);
WARN_ON(!rcu_gp_wq);
rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
WARN_ON(!rcu_par_gp_wq);
srcu_init();
}
#include "tree_exp.h"


@@ -225,9 +225,6 @@ struct rcu_data {
/* 5) _rcu_barrier(), OOM callbacks, and expediting. */
struct rcu_head barrier_head;
#ifdef CONFIG_RCU_FAST_NO_HZ
struct rcu_head oom_head;
#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
int exp_dynticks_snap; /* Double-check need for IPI. */
/* 6) Callback offloading. */
@@ -433,8 +430,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
/* Forward declarations for rcutree_plugin.h */
static void rcu_bootup_announce(void);
static void rcu_preempt_qs(void);
static void rcu_preempt_note_context_switch(bool preempt);
static void rcu_qs(void);
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
@@ -444,9 +440,8 @@ static int rcu_print_task_stall(struct rcu_node *rnp);
static int rcu_print_task_exp_stall(struct rcu_node *rnp);
static void rcu_preempt_check_blocked_tasks(struct rcu_state *rsp,
struct rcu_node *rnp);
static void rcu_preempt_check_callbacks(void);
static void rcu_flavor_sched_clock_irq(int user);
void call_rcu(struct rcu_head *head, rcu_callback_t func);
static void __init __rcu_init_preempt(void);
static void dump_blkd_tasks(struct rcu_state *rsp, struct rcu_node *rnp,
int ncheck);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);


@@ -266,7 +266,7 @@ static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp,
rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake);
}
/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */
/* Common code for work-done checking. */
static bool sync_exp_work_done(struct rcu_state *rsp, unsigned long s)
{
if (rcu_exp_gp_seq_done(rsp, s)) {
@@ -338,45 +338,6 @@ fastpath:
return false;
}
/* Invoked on each online non-idle CPU for expedited quiescent state. */
static void sync_sched_exp_handler(void *data)
{
struct rcu_data *rdp;
struct rcu_node *rnp;
struct rcu_state *rsp = data;
rdp = this_cpu_ptr(rsp->rda);
rnp = rdp->mynode;
if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
return;
if (rcu_is_cpu_rrupt_from_idle()) {
rcu_report_exp_rdp(&rcu_sched_state,
this_cpu_ptr(&rcu_sched_data), true);
return;
}
__this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true);
/* Store .exp before .rcu_urgent_qs. */
smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
resched_cpu(smp_processor_id());
}
/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */
static void sync_sched_exp_online_cleanup(int cpu)
{
struct rcu_data *rdp;
int ret;
struct rcu_node *rnp;
struct rcu_state *rsp = &rcu_sched_state;
rdp = per_cpu_ptr(rsp->rda, cpu);
rnp = rdp->mynode;
if (!(READ_ONCE(rnp->expmask) & rdp->grpmask))
return;
ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0);
WARN_ON_ONCE(ret);
}
/*
* Select the CPUs within the specified rcu_node that the upcoming
* expedited grace period needs to wait for.
@@ -693,39 +654,6 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp,
mutex_unlock(&rsp->exp_mutex);
}
/**
* synchronize_sched_expedited - Brute-force RCU-sched grace period
*
* Wait for an RCU-sched grace period to elapse, but use a "big hammer"
* approach to force the grace period to end quickly. This consumes
* significant time on all CPUs and is unfriendly to real-time workloads,
* so is thus not recommended for any sort of common-case code. In fact,
* if you are using synchronize_sched_expedited() in a loop, please
* restructure your code to batch your updates, and then use a single
* synchronize_sched() instead.
*
* This implementation can be thought of as an application of sequence
* locking to expedited grace periods, but using the sequence counter to
* determine when someone else has already done the work instead of for
* retrying readers.
*/
void synchronize_sched_expedited(void)
{
struct rcu_state *rsp = &rcu_sched_state;
RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_sched_expedited() in RCU read-side critical section");
/* If only one CPU, this is automatically a grace period. */
if (rcu_blocking_is_gp())
return;
_synchronize_rcu_expedited(rsp, sync_sched_exp_handler);
}
EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
#ifdef CONFIG_PREEMPT_RCU
/*
@@ -803,6 +731,11 @@ static void sync_rcu_exp_handler(void *info)
resched_cpu(rdp->cpu);
}
/* PREEMPT=y, so no RCU-sched to clean up after. */
static void sync_sched_exp_online_cleanup(int cpu)
{
}
/**
* synchronize_rcu_expedited - Brute-force RCU grace period
*
@@ -820,6 +753,8 @@ static void sync_rcu_exp_handler(void *info)
* you are using synchronize_rcu_expedited() in a loop, please restructure
* your code to batch your updates, and then use a single synchronize_rcu()
* instead.
*
* This has the same semantics as (but is more brutal than) synchronize_rcu().
*/
void synchronize_rcu_expedited(void)
{
@@ -838,13 +773,79 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
#else /* #ifdef CONFIG_PREEMPT_RCU */
/* Invoked on each online non-idle CPU for expedited quiescent state. */
static void sync_sched_exp_handler(void *data)
{
struct rcu_data *rdp;
struct rcu_node *rnp;
struct rcu_state *rsp = data;
rdp = this_cpu_ptr(rsp->rda);
rnp = rdp->mynode;
if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
__this_cpu_read(rcu_data.cpu_no_qs.b.exp))
return;
if (rcu_is_cpu_rrupt_from_idle()) {
rcu_report_exp_rdp(&rcu_state, this_cpu_ptr(&rcu_data));
return;
}
__this_cpu_write(rcu_data.cpu_no_qs.b.exp, true);
/* Store .exp before .rcu_urgent_qs. */
smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
resched_cpu(smp_processor_id());
}
/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */
static void sync_sched_exp_online_cleanup(int cpu)
{
struct rcu_data *rdp;
int ret;
struct rcu_node *rnp;
struct rcu_state *rsp = &rcu_state;
rdp = per_cpu_ptr(rsp->rda, cpu);
rnp = rdp->mynode;
if (!(READ_ONCE(rnp->expmask) & rdp->grpmask))
return;
ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0);
WARN_ON_ONCE(ret);
}
/*
* Wait for an rcu-preempt grace period, but make it happen quickly.
* But because preemptible RCU does not exist, map to rcu-sched.
* Because a context switch is a grace period for RCU-sched, any blocking
* grace-period wait automatically implies a grace period if there
is only one CPU online at any point in time during execution of either
* synchronize_sched() or synchronize_rcu_bh(). It is OK to occasionally
* incorrectly indicate that there are multiple CPUs online when there
* was in fact only one the whole time, as this just adds some overhead:
* RCU still operates correctly.
*/
static int rcu_blocking_is_gp(void)
{
int ret;
might_sleep(); /* Check for RCU read-side critical section. */
preempt_disable();
ret = num_online_cpus() <= 1;
preempt_enable();
return ret;
}
/* PREEMPT=n implementation of synchronize_rcu_expedited(). */
void synchronize_rcu_expedited(void)
{
synchronize_sched_expedited();
struct rcu_state *rsp = &rcu_state;
RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_sched_expedited() in RCU read-side critical section");
/* If only one CPU, this is automatically a grace period. */
if (rcu_blocking_is_gp())
return;
_synchronize_rcu_expedited(rsp, sync_sched_exp_handler);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);


@@ -123,10 +123,6 @@ static void __init rcu_bootup_announce_oddness(void)
#ifdef CONFIG_PREEMPT_RCU
RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
static struct rcu_state *const rcu_state_p = &rcu_preempt_state;
static struct rcu_data __percpu *const rcu_data_p = &rcu_preempt_data;
static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
bool wake);
static void rcu_read_unlock_special(struct task_struct *t);
@@ -306,15 +302,15 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
*
* Callers to this function must disable preemption.
*/
static void rcu_preempt_qs(void)
static void rcu_qs(void)
{
RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_qs() invoked with preemption enabled!!!\n");
RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!\n");
if (__this_cpu_read(rcu_data_p->cpu_no_qs.s)) {
trace_rcu_grace_period(TPS("rcu_preempt"),
__this_cpu_read(rcu_data_p->gp_seq),
TPS("cpuqs"));
__this_cpu_write(rcu_data_p->cpu_no_qs.b.norm, false);
barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */
barrier(); /* Coordinate with rcu_flavor_sched_clock_irq(). */
current->rcu_read_unlock_special.b.need_qs = false;
}
}
@@ -332,12 +328,14 @@ static void rcu_preempt_qs(void)
*
* Caller must disable interrupts.
*/
static void rcu_preempt_note_context_switch(bool preempt)
void rcu_note_context_switch(bool preempt)
{
struct task_struct *t = current;
struct rcu_data *rdp;
struct rcu_node *rnp;
barrier(); /* Avoid RCU read-side critical sections leaking down. */
trace_rcu_utilization(TPS("Start context switch"));
lockdep_assert_irqs_disabled();
WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0);
if (t->rcu_read_lock_nesting > 0 &&
@@ -385,8 +383,11 @@ static void rcu_preempt_note_context_switch(bool preempt)
* grace period, then the fact that the task has been enqueued
* means that we continue to block the current grace period.
*/
rcu_preempt_qs();
rcu_qs();
trace_rcu_utilization(TPS("End context switch"));
barrier(); /* Avoid RCU read-side critical sections leaking up. */
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
/*
* Check for preempted RCU readers blocking the current grace period
@@ -495,7 +496,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
return;
}
if (special.b.need_qs) {
rcu_preempt_qs();
rcu_qs();
t->rcu_read_unlock_special.b.need_qs = false;
if (!t->rcu_read_unlock_special.s && !rdp->deferred_qs) {
local_irq_restore(flags);
@@ -600,7 +601,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
*/
static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
{
return (this_cpu_ptr(&rcu_preempt_data)->deferred_qs ||
return (this_cpu_ptr(&rcu_data)->deferred_qs ||
READ_ONCE(t->rcu_read_unlock_special.s)) &&
!t->rcu_read_lock_nesting;
}
@@ -779,17 +780,21 @@ rcu_preempt_check_blocked_tasks(struct rcu_state *rsp, struct rcu_node *rnp)
}
/*
* Check for a quiescent state from the current CPU. When a task blocks,
* the task is recorded in the corresponding CPU's rcu_node structure,
* which is checked elsewhere.
*
* Caller must disable hard irqs.
* Check for a quiescent state from the current CPU, including voluntary
* context switches for Tasks RCU. When a task blocks, the task is
* recorded in the corresponding CPU's rcu_node structure, which is checked
* elsewhere, hence this function need only check for quiescent states
* related to the current CPU, not to those related to tasks.
*/
static void rcu_preempt_check_callbacks(void)
static void rcu_flavor_sched_clock_irq(int user)
{
struct rcu_state *rsp = &rcu_preempt_state;
struct rcu_state *rsp = &rcu_state;
struct task_struct *t = current;
if (user || rcu_is_cpu_rrupt_from_idle()) {
rcu_note_voluntary_context_switch(current);
}
if (t->rcu_read_lock_nesting > 0 ||
(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) {
/* No QS, force context switch if deferred. */
@@ -799,7 +804,7 @@ static void rcu_preempt_check_callbacks(void)
rcu_preempt_deferred_qs(t); /* Report deferred QS. */
return;
} else if (!t->rcu_read_lock_nesting) {
rcu_preempt_qs(); /* Report immediate QS. */
rcu_qs(); /* Report immediate QS. */
return;
}
@@ -812,44 +817,6 @@ static void rcu_preempt_check_callbacks(void)
t->rcu_read_unlock_special.b.need_qs = true;
}
/**
* call_rcu() - Queue an RCU callback for invocation after a grace period.
* @head: structure to be used for queueing the RCU updates.
* @func: actual callback function to be invoked after the grace period
*
* The callback function will be invoked some time after a full grace
* period elapses, in other words after all pre-existing RCU read-side
* critical sections have completed. However, the callback function
* might well execute concurrently with RCU read-side critical sections
* that started after call_rcu() was invoked. RCU read-side critical
* sections are delimited by rcu_read_lock() and rcu_read_unlock(),
* and may be nested.
*
* Note that all CPUs must agree that the grace period extended beyond
all pre-existing RCU read-side critical sections. On systems with more
* than one CPU, this means that when "func()" is invoked, each CPU is
* guaranteed to have executed a full memory barrier since the end of its
* last RCU read-side critical section whose beginning preceded the call
* to call_rcu(). It also means that each CPU executing an RCU read-side
* critical section that continues beyond the start of "func()" must have
* executed a memory barrier after the call_rcu() but before the beginning
* of that RCU read-side critical section. Note that these guarantees
* include CPUs that are offline, idle, or executing in user mode, as
* well as CPUs that are executing in the kernel.
*
* Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
* resulting RCU callback function "func()", then both CPU A and CPU B are
* guaranteed to execute a full memory barrier during the time interval
* between the call to call_rcu() and the invocation of "func()" -- even
* if CPU A and CPU B are the same CPU (but again only if the system has
* more than one CPU).
*/
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
__call_rcu(head, func, rcu_state_p, -1, 0);
}
EXPORT_SYMBOL_GPL(call_rcu);
/**
* synchronize_rcu - wait until a grace period has elapsed.
*
@@ -860,14 +827,28 @@ EXPORT_SYMBOL_GPL(call_rcu);
* concurrently with new RCU read-side critical sections that began while
* synchronize_rcu() was waiting. RCU read-side critical sections are
* delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
* In addition, regions of code across which interrupts, preemption, or
* softirqs have been disabled also serve as RCU read-side critical
* sections. This includes hardware interrupt handlers, softirq handlers,
* and NMI handlers.
*
* See the description of synchronize_sched() for more detailed
* information on memory-ordering guarantees. However, please note
* that -only- the memory-ordering guarantees apply. For example,
* synchronize_rcu() is -not- guaranteed to wait on things like code
* protected by preempt_disable(), instead, synchronize_rcu() is -only-
* guaranteed to wait on RCU read-side critical sections, that is, sections
* of code protected by rcu_read_lock().
* Note that this guarantee implies further memory-ordering guarantees.
* On systems with more than one CPU, when synchronize_rcu() returns,
* each CPU is guaranteed to have executed a full memory barrier since the
* end of its last RCU-sched read-side critical section whose beginning
* preceded the call to synchronize_rcu(). In addition, each CPU having
* an RCU read-side critical section that extends beyond the return from
* synchronize_rcu() is guaranteed to have executed a full memory barrier
* after the beginning of synchronize_rcu() and before the beginning of
* that RCU read-side critical section. Note that these guarantees include
* CPUs that are offline, idle, or executing in user mode, as well as CPUs
* that are executing in the kernel.
*
* Furthermore, if CPU A invoked synchronize_rcu(), which returned
* to its caller on CPU B, then both CPU A and CPU B are guaranteed
* to have executed a full memory barrier during the execution of
* synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but
* again only if the system has more than one CPU).
*/
void synchronize_rcu(void)
{
@@ -884,28 +865,6 @@ void synchronize_rcu(void)
}
EXPORT_SYMBOL_GPL(synchronize_rcu);
/**
* rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
*
* Note that this primitive does not necessarily wait for an RCU grace period
* to complete. For example, if there are no RCU callbacks queued anywhere
* in the system, then rcu_barrier() is within its rights to return
* immediately, without waiting for anything, much less an RCU grace period.
*/
void rcu_barrier(void)
{
_rcu_barrier(rcu_state_p);
}
EXPORT_SYMBOL_GPL(rcu_barrier);
/*
* Initialize preemptible RCU's state structures.
*/
static void __init __rcu_init_preempt(void)
{
rcu_init_one(rcu_state_p);
}
/*
* Check for a task exiting while in a preemptible-RCU read-side
* critical section, clean up if so. No need to issue warnings,
@@ -969,8 +928,6 @@ dump_blkd_tasks(struct rcu_state *rsp, struct rcu_node *rnp, int ncheck)
#else /* #ifdef CONFIG_PREEMPT_RCU */
static struct rcu_state *const rcu_state_p = &rcu_sched_state;
/*
* Tell them what RCU they are running.
*/
@@ -980,18 +937,48 @@ static void __init rcu_bootup_announce(void)
rcu_bootup_announce_oddness();
}
/* Because preemptible RCU does not exist, we can ignore its QSes. */
static void rcu_preempt_qs(void)
/*
* Note a quiescent state for PREEMPT=n. Because we do not need to know
* how many quiescent states passed, just if there was at least one since
* the start of the grace period, this just sets a flag. The caller must
* have disabled preemption.
*/
static void rcu_qs(void)
{
RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!");
if (!__this_cpu_read(rcu_data.cpu_no_qs.s))
return;
trace_rcu_grace_period(TPS("rcu_sched"),
__this_cpu_read(rcu_data.gp_seq), TPS("cpuqs"));
__this_cpu_write(rcu_data.cpu_no_qs.b.norm, false);
if (!__this_cpu_read(rcu_data.cpu_no_qs.b.exp))
return;
__this_cpu_write(rcu_data.cpu_no_qs.b.exp, false);
rcu_report_exp_rdp(&rcu_state, this_cpu_ptr(&rcu_data));
}
/*
* Because preemptible RCU does not exist, we never have to check for
* CPUs being in quiescent states.
* Note a PREEMPT=n context switch. The caller must have disabled interrupts.
*/
static void rcu_preempt_note_context_switch(bool preempt)
void rcu_note_context_switch(bool preempt)
{
barrier(); /* Avoid RCU read-side critical sections leaking down. */
trace_rcu_utilization(TPS("Start context switch"));
rcu_qs();
/* Load rcu_urgent_qs before other flags. */
if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs)))
goto out;
this_cpu_write(rcu_dynticks.rcu_urgent_qs, false);
if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs)))
rcu_momentary_dyntick_idle();
this_cpu_inc(rcu_dynticks.rcu_qs_ctr);
if (!preempt)
rcu_tasks_qs(current);
out:
trace_rcu_utilization(TPS("End context switch"));
barrier(); /* Avoid RCU read-side critical sections leaking up. */
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
/*
* Because preemptible RCU does not exist, there are never any preempted
@@ -1059,29 +1046,44 @@ rcu_preempt_check_blocked_tasks(struct rcu_state *rsp, struct rcu_node *rnp)
}
/*
* Because preemptible RCU does not exist, it never has any callbacks
* to check.
* Check to see if this CPU is in a non-context-switch quiescent state,
* namely user mode and idle loop.
*/
static void rcu_preempt_check_callbacks(void)
static void rcu_flavor_sched_clock_irq(int user)
{
if (user || rcu_is_cpu_rrupt_from_idle()) {
/*
* Get here if this CPU took its interrupt from user
* mode or from the idle loop, and if this is not a
* nested interrupt. In this case, the CPU is in
* a quiescent state, so note it.
*
* No memory barrier is required here because rcu_qs()
* references only CPU-local variables that other CPUs
* neither access nor modify, at least not while the
* corresponding CPU is online.
*/
rcu_qs();
}
}
/*
* Because preemptible RCU does not exist, rcu_barrier() is just
* another name for rcu_barrier_sched().
*/
void rcu_barrier(void)
{
rcu_barrier_sched();
}
EXPORT_SYMBOL_GPL(rcu_barrier);
/*
* Because preemptible RCU does not exist, it need not be initialized.
*/
static void __init __rcu_init_preempt(void)
/* PREEMPT=n implementation of synchronize_rcu(). */
void synchronize_rcu(void)
{
RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_rcu() in RCU-sched read-side critical section");
if (rcu_blocking_is_gp())
return;
if (rcu_gp_is_expedited())
synchronize_rcu_expedited();
else
wait_rcu_gp(call_rcu);
}
EXPORT_SYMBOL_GPL(synchronize_rcu);
/*
* Because preemptible RCU does not exist, tasks cannot possibly exit
@@ -1324,9 +1326,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
static void rcu_kthread_do_work(void)
{
rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data));
rcu_do_batch(&rcu_state, this_cpu_ptr(&rcu_data));
}
static void rcu_cpu_kthread_setup(unsigned int cpu)
@@ -1733,87 +1733,6 @@ static void rcu_idle_count_callbacks_posted(void)
__this_cpu_add(rcu_dynticks.nonlazy_posted, 1);
}
/*
* Data for flushing lazy RCU callbacks at OOM time.
*/
static atomic_t oom_callback_count;
static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq);
/*
* RCU OOM callback -- decrement the outstanding count and deliver the
* wake-up if we are the last one.
*/
static void rcu_oom_callback(struct rcu_head *rhp)
{
if (atomic_dec_and_test(&oom_callback_count))
wake_up(&oom_callback_wq);
}
/*
* Post an rcu_oom_notify callback on the current CPU if it has at
* least one lazy callback. This will unnecessarily post callbacks
* to CPUs that already have a non-lazy callback at the end of their
* callback list, but this is an infrequent operation, so accept some
* extra overhead to keep things simple.
*/
static void rcu_oom_notify_cpu(void *unused)
{
struct rcu_state *rsp;
struct rcu_data *rdp;
for_each_rcu_flavor(rsp) {
rdp = raw_cpu_ptr(rsp->rda);
if (rcu_segcblist_n_lazy_cbs(&rdp->cblist)) {
atomic_inc(&oom_callback_count);
rsp->call(&rdp->oom_head, rcu_oom_callback);
}
}
}
/*
* If low on memory, ensure that each CPU has a non-lazy callback.
* This will wake up CPUs that have only lazy callbacks, in turn
* ensuring that they free up the corresponding memory in a timely manner.
* Because an uncertain amount of memory will be freed in some uncertain
* timeframe, we do not claim to have freed anything.
*/
static int rcu_oom_notify(struct notifier_block *self,
unsigned long notused, void *nfreed)
{
int cpu;
/* Wait for callbacks from earlier instance to complete. */
wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0);
smp_mb(); /* Ensure callback reuse happens after callback invocation. */
/*
* Prevent premature wakeup: ensure that all increments happen
* before there is a chance of the counter reaching zero.
*/
atomic_set(&oom_callback_count, 1);
for_each_online_cpu(cpu) {
smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
cond_resched_tasks_rcu_qs();
}
/* Unconditionally decrement: no need to wake ourselves up. */
atomic_dec(&oom_callback_count);
return NOTIFY_OK;
}
static struct notifier_block rcu_oom_nb = {
.notifier_call = rcu_oom_notify
};
static int __init rcu_register_oom_notifier(void)
{
register_oom_notifier(&rcu_oom_nb);
return 0;
}
early_initcall(rcu_register_oom_notifier);
#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
#ifdef CONFIG_RCU_FAST_NO_HZ
@@ -1942,11 +1861,22 @@ static void increment_cpu_stall_ticks(void)
*/
/* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */
/*
* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters.
* The string after the "rcu_nocbs=" is either "all" for all CPUs, or a
* comma-separated list of CPUs and/or CPU ranges. If an invalid list is
* given, a warning is emitted and all CPUs are offloaded.
*/
static int __init rcu_nocb_setup(char *str)
{
alloc_bootmem_cpumask_var(&rcu_nocb_mask);
cpulist_parse(str, rcu_nocb_mask);
if (!strcasecmp(str, "all"))
cpumask_setall(rcu_nocb_mask);
else
if (cpulist_parse(str, rcu_nocb_mask)) {
pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
cpumask_setall(rcu_nocb_mask);
}
return 1;
}
__setup("rcu_nocbs=", rcu_nocb_setup);


@@ -553,11 +553,16 @@ static void test_callback(struct rcu_head *r)
pr_info("RCU test callback executed %d\n", rcu_self_test_counter);
}
DEFINE_STATIC_SRCU(early_srcu);
static void early_boot_test_call_rcu(void)
{
static struct rcu_head head;
static struct rcu_head shead;
call_rcu(&head, test_callback);
if (IS_ENABLED(CONFIG_SRCU))
call_srcu(&early_srcu, &shead, test_callback);
}
static void early_boot_test_call_rcu_bh(void)
@@ -595,6 +600,10 @@ static int rcu_verify_early_boot_tests(void)
if (rcu_self_test) {
early_boot_test_counter++;
rcu_barrier();
if (IS_ENABLED(CONFIG_SRCU)) {
early_boot_test_counter++;
srcu_barrier(&early_srcu);
}
}
if (rcu_self_test_bh) {
early_boot_test_counter++;


@@ -423,7 +423,7 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
* state, even in the failed case, an explicit smp_mb() must be used.
*/
smp_mb__before_atomic();
if (cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))
if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
return;
head->count++;


@@ -307,7 +307,6 @@ restart:
pending >>= softirq_bit;
}
rcu_bh_qs();
if (__this_cpu_read(ksoftirqd) == current)
rcu_softirq_qs();
local_irq_disable();


@@ -726,7 +726,7 @@ static int alarm_timer_create(struct k_itimer *new_timer)
static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm,
ktime_t now)
{
struct task_struct *task = (struct task_struct *)alarm->data;
struct task_struct *task = alarm->data;
alarm->data = NULL;
if (task)
@@ -822,7 +822,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
struct restart_block *restart = &current->restart_block;
struct alarm alarm;
ktime_t exp;
int ret = 0;
int ret;
if (!alarmtimer_get_rtcdev())
return -EOPNOTSUPP;


@@ -1773,7 +1773,7 @@ void update_process_times(int user_tick)
/* Note: this timer irq context must be accounted for as well. */
account_process_tick(p, user_tick);
run_local_timers();
rcu_check_callbacks(user_tick);
rcu_sched_clock_irq(user_tick);
#ifdef CONFIG_IRQ_WORK
if (in_irq())
irq_work_tick();


@@ -4345,7 +4345,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
cpumask_var_t tracing_cpumask_new;
int err, cpu;
if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
if (!zalloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
return -ENOMEM;
err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);


@@ -1982,11 +1982,11 @@ static bool check_new_pcp(struct page *page)
return check_new_page(page);
}
#else
static bool check_pcp_refill(struct page *page)
static inline bool check_pcp_refill(struct page *page)
{
return check_new_page(page);
return false;
}
static bool check_new_pcp(struct page *page)
static inline bool check_new_pcp(struct page *page)
{
return false;
}


@@ -2324,6 +2324,10 @@ void __kmemcg_cache_deactivate(struct kmem_cache *cachep)
{
__kmem_cache_shrink(cachep);
}
void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s)
{
}
#endif
int __kmem_cache_shutdown(struct kmem_cache *cachep)


@@ -172,6 +172,7 @@ int __kmem_cache_shutdown(struct kmem_cache *);
void __kmem_cache_release(struct kmem_cache *);
int __kmem_cache_shrink(struct kmem_cache *);
void __kmemcg_cache_deactivate(struct kmem_cache *s);
void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s);
void slab_kmem_cache_release(struct kmem_cache *);
struct seq_file;
@@ -294,8 +295,6 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order,
extern void slab_init_memcg_params(struct kmem_cache *);
extern void memcg_link_cache(struct kmem_cache *s);
extern void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s,
void (*deact_fn)(struct kmem_cache *));
#else /* CONFIG_MEMCG_KMEM */


@@ -682,7 +682,7 @@ static void kmemcg_deactivate_workfn(struct work_struct *work)
put_online_mems();
put_online_cpus();
/* done, put the ref from slab_deactivate_memcg_cache_rcu_sched() */
/* done, put the ref from kmemcg_cache_deactivate() */
css_put(&s->memcg_params.memcg->css);
}
@@ -700,19 +700,7 @@ static void kmemcg_deactivate_rcufn(struct rcu_head *head)
queue_work(memcg_kmem_cache_wq, &s->memcg_params.deact_work);
}
/**
* slab_deactivate_memcg_cache_rcu_sched - schedule deactivation after a
* sched RCU grace period
* @s: target kmem_cache
* @deact_fn: deactivation function to call
*
* Schedule @deact_fn to be invoked with online cpus, mems and slab_mutex
* held after a sched RCU grace period. The slab is guaranteed to stay
* alive until @deact_fn is finished. This is to be used from
* __kmemcg_cache_deactivate().
*/
void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s,
void (*deact_fn)(struct kmem_cache *))
static void kmemcg_cache_deactivate(struct kmem_cache *s)
{
if (WARN_ON_ONCE(is_root_cache(s)) ||
WARN_ON_ONCE(s->memcg_params.deact_fn))
@@ -727,6 +715,8 @@ void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s,
if (s->memcg_params.root_cache->memcg_params.dying)
goto unlock;
__kmemcg_cache_deactivate(s);
/* pin memcg so that @s doesn't get destroyed in the middle */
css_get(&s->memcg_params.memcg->css);
@@ -755,7 +745,7 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
if (!c)
continue;
__kmemcg_cache_deactivate(c);
kmemcg_cache_deactivate(c);
arr->entries[idx] = NULL;
}
mutex_unlock(&slab_mutex);


@@ -4087,7 +4087,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
}
#ifdef CONFIG_MEMCG
static void kmemcg_cache_deact_after_rcu(struct kmem_cache *s)
void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s)
{
/*
* Called with all the locks held after a sched RCU grace period.
@@ -4113,12 +4113,6 @@ void __kmemcg_cache_deactivate(struct kmem_cache *s)
*/
slub_set_cpu_partial(s, 0);
s->min_partial = 0;
/*
* s->cpu_partial is checked locklessly (see put_cpu_partial), so
* we have to make sure the change is visible before shrinking.
*/
slab_deactivate_memcg_cache_rcu_sched(s, kmemcg_cache_deact_after_rcu);
}
#endif


@@ -177,7 +177,6 @@ config CFG80211_DEBUGFS
config CFG80211_CRDA_SUPPORT
bool "support CRDA" if EXPERT
default y
help
You should enable this option unless you know for sure you have no
need for it, for example when using internal regdb (above) or the