Files
kernel_nothing_sm7325/drivers/md/raid5-cache.c
Raghavendra Rao Ananta 5bd75403be Merge remote-tracking branch 'remotes/origin/tmp-f686d9f' into msm-lahaina
* remotes/origin/tmp-f686d9f:
  ANDROID: update abi_gki_aarch64.xml for 5.2-rc6
  Linux 5.2-rc6
  Revert "iommu/vt-d: Fix lock inversion between iommu->lock and device_domain_lock"
  Bluetooth: Fix regression with minimum encryption key size alignment
  tcp: refine memory limit test in tcp_fragment()
  x86/vdso: Prevent segfaults due to hoisted vclock reads
  SUNRPC: Fix a credential refcount leak
  Revert "SUNRPC: Declare RPC timers as TIMER_DEFERRABLE"
  net :sunrpc :clnt :Fix xps refcount imbalance on the error path
  NFS4: Only set creation opendata if O_CREAT
  ANDROID: gki_defconfig: workaround to enable configs
  ANDROID: gki_defconfig: more configs for partners
  ARM: 8867/1: vdso: pass --be8 to linker if necessary
  KVM: nVMX: reorganize initial steps of vmx_set_nested_state
  KVM: PPC: Book3S HV: Invalidate ERAT when flushing guest TLB entries
  habanalabs: use u64_to_user_ptr() for reading user pointers
  nfsd: replace Jeff by Chuck as nfsd co-maintainer
  inet: clear num_timeout reqsk_alloc()
  PCI/P2PDMA: Ignore root complex whitelist when an IOMMU is present
  net: mvpp2: debugfs: Add pmap to fs dump
  ipv6: Default fib6_type to RTN_UNICAST when not set
  net: hns3: Fix inconsistent indenting
  net/af_iucv: always register net_device notifier
  net/af_iucv: build proper skbs for HiperTransport
  net/af_iucv: remove GFP_DMA restriction for HiperTransport
  doc: fix documentation about UIO_MEM_LOGICAL using
  MAINTAINERS / Documentation: Thorsten Scherer is the successor of Gavin Schenk
  docs: fb: Add TER16x32 to the available font names
  MAINTAINERS: fpga: hand off maintainership to Moritz
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 507
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 506
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 505
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 504
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 503
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 502
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 501
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 500
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 499
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 498
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 497
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 496
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 495
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 491
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 490
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 489
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 488
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 487
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 486
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 485
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 484
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 482
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 481
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 480
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 479
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 477
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 475
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 474
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 473
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 472
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 471
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 469
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 468
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 467
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 466
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 465
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 464
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 463
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 462
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 461
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 460
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 459
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 457
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 456
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 455
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 454
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 452
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 451
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 250
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 248
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 247
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 246
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 245
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 244
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 243
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 239
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 238
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 237
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 235
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 234
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 233
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 232
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 231
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 230
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 226
  KVM: arm/arm64: Fix emulated ptimer irq injection
  net: dsa: mv88e6xxx: fix shift of FID bits in mv88e6185_g1_vtu_loadpurge()
  tests: kvm: Check for a kernel warning
  kvm: tests: Sort tests in the Makefile alphabetically
  KVM: x86/mmu: Allocate PAE root array when using SVM's 32-bit NPT
  KVM: x86: Modify struct kvm_nested_state to have explicit fields for data
  fanotify: update connector fsid cache on add mark
  quota: fix a problem about transfer quota
  drm/i915: Don't clobber M/N values during fastset check
  powerpc: enable a 30-bit ZONE_DMA for 32-bit pmac
  ovl: make i_ino consistent with st_ino in more cases
  scsi: qla2xxx: Fix hardlockup in abort command during driver remove
  scsi: ufs: Avoid runtime suspend possibly being blocked forever
  scsi: qedi: update driver version to 8.37.0.20
  scsi: qedi: Check targetname while finding boot target information
  hvsock: fix epollout hang from race condition
  net/udp_gso: Allow TX timestamp with UDP GSO
  net: netem: fix use after free and double free with packet corruption
  net: netem: fix backlog accounting for corrupted GSO frames
  net: lio_core: fix potential sign-extension overflow on large shift
  tipc: pass tunnel dev as NULL to udp_tunnel(6)_xmit_skb
  ip6_tunnel: allow not to count pkts on tstats by passing dev as NULL
  ip_tunnel: allow not to count pkts on tstats by setting skb's dev to NULL
  apparmor: reset pos on failure to unpack for various functions
  apparmor: enforce nullbyte at end of tag string
  apparmor: fix PROFILE_MEDIATES for untrusted input
  RDMA/efa: Handle mmap insertions overflow
  tun: wake up waitqueues after IFF_UP is set
  drm: return -EFAULT if copy_to_user() fails
  net: remove duplicate fetch in sock_getsockopt
  tipc: fix issues with early FAILOVER_MSG from peer
  bnx2x: Check if transceiver implements DDM before access
  xhci: detect USB 3.2 capable host controllers correctly
  usb: xhci: Don't try to recover an endpoint if port is in error state.
  KVM: fix typo in documentation
  drm/panfrost: Make sure a BO is only unmapped when appropriate
  md: fix for divide error in status_resync
  soc: ixp4xx: npe: Fix an IS_ERR() vs NULL check in probe
  arm64/mm: don't initialize pgd_cache twice
  MAINTAINERS: Update my email address
  arm64/sve: <uapi/asm/ptrace.h> should not depend on <uapi/linux/prctl.h>
  ovl: fix typo in MODULE_PARM_DESC
  ovl: fix bogus -Wmaybe-unitialized warning
  ovl: don't fail with disconnected lower NFS
  mmc: core: Prevent processing SDIO IRQs when the card is suspended
  mmc: sdhci: sdhci-pci-o2micro: Correctly set bus width when tuning
  brcmfmac: sdio: Don't tune while the card is off
  mmc: core: Add sdio_retune_hold_now() and sdio_retune_release()
  brcmfmac: sdio: Disable auto-tuning around commands expected to fail
  mmc: core: API to temporarily disable retuning for SDIO CRC errors
  Revert "brcmfmac: disable command decode in sdio_aos"
  ARM: ixp4xx: include irqs.h where needed
  ARM: ixp4xx: mark ixp4xx_irq_setup as __init
  ARM: ixp4xx: don't select SERIAL_OF_PLATFORM
  firmware: trusted_foundations: add ARMv7 dependency
  usb: dwc2: Use generic PHY width in params setup
  RDMA/efa: Fix success return value in case of error
  IB/hfi1: Handle port down properly in pio
  IB/hfi1: Handle wakeup of orphaned QPs for pio
  IB/hfi1: Wakeup QPs orphaned on wait list after flush
  IB/hfi1: Use aborts to trigger RC throttling
  IB/hfi1: Create inline to get extended headers
  IB/hfi1: Silence txreq allocation warnings
  IB/hfi1: Avoid hardlockup with flushlist_lock
  KVM: PPC: Book3S HV: Only write DAWR[X] when handling h_set_dawr in real mode
  KVM: PPC: Book3S HV: Fix r3 corruption in h_set_dabr()
  fs/namespace: fix unprivileged mount propagation
  vfs: fsmount: add missing mntget()
  cifs: fix GlobalMid_Lock bug in cifs_reconnect
  SMB3: retry on STATUS_INSUFFICIENT_RESOURCES instead of failing write
  staging: erofs: add requirements field in superblock
  arm64: ssbd: explicitly depend on <linux/prctl.h>
  block: fix page leak when merging to same page
  block: return from __bio_try_merge_page if merging occured in the same page
  Btrfs: fix failure to persist compression property xattr deletion on fsync
  riscv: remove unused barrier defines
  usb: chipidea: udc: workaround for endpoint conflict issue
  MAINTAINERS: Change QCOM repo location
  mmc: mediatek: fix SDIO IRQ detection issue
  mmc: mediatek: fix SDIO IRQ interrupt handle flow
  mmc: core: complete HS400 before checking status
  riscv: mm: synchronize MMU after pte change
  MAINTAINERS: Update my email address to use @kernel.org
  ANDROID: update abi_gki_aarch64.xml for 5.2-rc5
  riscv: dts: add initial board data for the SiFive HiFive Unleashed
  riscv: dts: add initial support for the SiFive FU540-C000 SoC
  dt-bindings: riscv: convert cpu binding to json-schema
  dt-bindings: riscv: sifive: add YAML documentation for the SiFive FU540
  arch: riscv: add support for building DTB files from DT source data
  drm/i915/gvt: ignore unexpected pvinfo write
  lapb: fixed leak of control-blocks.
  tipc: purge deferredq list for each grp member in tipc_group_delete
  ax25: fix inconsistent lock state in ax25_destroy_timer
  neigh: fix use-after-free read in pneigh_get_next
  tcp: fix compile error if !CONFIG_SYSCTL
  hv_sock: Suppress bogus "may be used uninitialized" warnings
  be2net: Fix number of Rx queues used for flow hashing
  net: handle 802.1P vlan 0 packets properly
  Linux 5.2-rc5
  tcp: enforce tcp_min_snd_mss in tcp_mtu_probing()
  tcp: add tcp_min_snd_mss sysctl
  tcp: tcp_fragment() should apply sane memory limits
  tcp: limit payload size of sacked skbs
  Revert "net: phylink: set the autoneg state in phylink_phy_change"
  bpf: fix nested bpf tracepoints with per-cpu data
  bpf: Fix out of bounds memory access in bpf_sk_storage
  vsock/virtio: set SOCK_DONE on peer shutdown
  net: dsa: rtl8366: Fix up VLAN filtering
  net: phylink: set the autoneg state in phylink_phy_change
  powerpc/32: fix build failure on book3e with KVM
  powerpc/booke: fix fast syscall entry on SMP
  powerpc/32s: fix initial setup of segment registers on secondary CPU
  x86/microcode, cpuhotplug: Add a microcode loader CPU hotplug callback
  net: add high_order_alloc_disable sysctl/static key
  tcp: add tcp_tx_skb_cache sysctl
  tcp: add tcp_rx_skb_cache sysctl
  sysctl: define proc_do_static_key()
  hv_netvsc: Set probe mode to sync
  net: sched: flower: don't call synchronize_rcu() on mask creation
  net: dsa: fix warning same module names
  sctp: Free cookie before we memdup a new one
  net: dsa: microchip: Don't try to read stats for unused ports
  qmi_wwan: extend permitted QMAP mux_id value range
  qmi_wwan: avoid RCU stalls on device disconnect when in QMAP mode
  qmi_wwan: add network device usage statistics for qmimux devices
  qmi_wwan: add support for QMAP padding in the RX path
  bpf, x64: fix stack layout of JITed bpf code
  Smack: Restore the smackfsdef mount option and add missing prefixes
  bpf, devmap: Add missing RCU read lock on flush
  bpf, devmap: Add missing bulk queue free
  bpf, devmap: Fix premature entry free on destroying map
  ftrace: Fix NULL pointer dereference in free_ftrace_func_mapper()
  module: Fix livepatch/ftrace module text permissions race
  tracing/uprobe: Fix obsolete comment on trace_uprobe_create()
  tracing/uprobe: Fix NULL pointer dereference in trace_uprobe_create()
  tracing: Make two symbols static
  tracing: avoid build warning with HAVE_NOP_MCOUNT
  tracing: Fix out-of-range read in trace_stack_print()
  gfs2: Fix rounding error in gfs2_iomap_page_prepare
  net: phylink: further mac_config documentation improvements
  nfc: Ensure presence of required attributes in the deactivate_target handler
  btrfs: start readahead also in seed devices
  x86/kasan: Fix boot with 5-level paging and KASAN
  cfg80211: report measurement start TSF correctly
  cfg80211: fix memory leak of wiphy device name
  cfg80211: util: fix bit count off by one
  mac80211: do not start any work during reconfigure flow
  cfg80211: use BIT_ULL in cfg80211_parse_mbssid_data()
  mac80211: only warn once on chanctx_conf being NULL
  mac80211: drop robust management frames from unknown TA
  gpu: ipu-v3: image-convert: Fix image downsize coefficients
  gpu: ipu-v3: image-convert: Fix input bytesperline for packed formats
  gpu: ipu-v3: image-convert: Fix input bytesperline width/height align
  thunderbolt: Implement CIO reset correctly for Titan Ridge
  ARM: davinci: da8xx: specify dma_coherent_mask for lcdc
  ARM: davinci: da850-evm: call regulator_has_full_constraints()
  timekeeping: Repair ktime_get_coarse*() granularity
  Revert "ALSA: hda/realtek - Improve the headset mic for Acer Aspire laptops"
  ANDROID: update abi_gki_aarch64.xml
  mm/devm_memremap_pages: fix final page put race
  PCI/P2PDMA: track pgmap references per resource, not globally
  lib/genalloc: introduce chunk owners
  PCI/P2PDMA: fix the gen_pool_add_virt() failure path
  mm/devm_memremap_pages: introduce devm_memunmap_pages
  drivers/base/devres: introduce devm_release_action()
  mm/vmscan.c: fix trying to reclaim unevictable LRU page
  coredump: fix race condition between collapse_huge_page() and core dumping
  mm/mlock.c: change count_mm_mlocked_page_nr return type
  mm: mmu_gather: remove __tlb_reset_range() for force flush
  fs/ocfs2: fix race in ocfs2_dentry_attach_lock()
  mm/vmscan.c: fix recent_rotated history
  mm/mlock.c: mlockall error for flag MCL_ONFAULT
  scripts/decode_stacktrace.sh: prefix addr2line with $CROSS_COMPILE
  mm/list_lru.c: fix memory leak in __memcg_init_list_lru_node
  mm: memcontrol: don't batch updates of local VM stats and events
  PCI: PM: Skip devices in D0 for suspend-to-idle
  ANDROID: Removed extraneous configs from gki
  powerpc/bpf: use unsigned division instruction for 64-bit operations
  bpf: fix div64 overflow tests to properly detect errors
  bpf: sync BPF_FIB_LOOKUP flag changes with BPF uapi
  bpf: simplify definition of BPF_FIB_LOOKUP related flags
  cifs: add spinlock for the openFileList to cifsInodeInfo
  cifs: fix panic in smb2_reconnect
  x86/fpu: Don't use current->mm to check for a kthread
  KVM: nVMX: use correct clean fields when copying from eVMCS
  vfio-ccw: Destroy kmem cache region on module exit
  block/ps3vram: Use %llu to format sector_t after LBDAF removal
  libata: Extend quirks for the ST1000LM024 drives with NOLPM quirk
  bcache: only set BCACHE_DEV_WB_RUNNING when cached device attached
  bcache: fix stack corruption by PRECEDING_KEY()
  arm64/sve: Fix missing SVE/FPSIMD endianness conversions
  blk-mq: remove WARN_ON(!q->elevator) from blk_mq_sched_free_requests
  blkio-controller.txt: Remove references to CFQ
  block/switching-sched.txt: Update to blk-mq schedulers
  null_blk: remove duplicate check for report zone
  blk-mq: no need to check return value of debugfs_create functions
  io_uring: fix memory leak of UNIX domain socket inode
  block: force select mq-deadline for zoned block devices
  binder: fix possible UAF when freeing buffer
  drm/amdgpu: return 0 by default in amdgpu_pm_load_smu_firmware
  drm/amdgpu: Fix bounds checking in amdgpu_ras_is_supported()
  ANDROID: x86 gki_defconfig: enable DMA_CMA
  ANDROID: Fixed x86 regression
  ANDROID: gki_defconfig: enable DMA_CMA
  Input: synaptics - enable SMBus on ThinkPad E480 and E580
  net: mvpp2: prs: Use the correct helpers when removing all VID filters
  net: mvpp2: prs: Fix parser range for VID filtering
  mlxsw: spectrum: Disallow prio-tagged packets when PVID is removed
  mlxsw: spectrum_buffers: Reduce pool size on Spectrum-2
  selftests: tc_flower: Add TOS matching test
  mlxsw: spectrum_flower: Fix TOS matching
  selftests: mlxsw: Test nexthop offload indication
  mlxsw: spectrum_router: Refresh nexthop neighbour when it becomes dead
  mlxsw: spectrum: Use different seeds for ECMP and LAG hash
  net: tls, correctly account for copied bytes with multiple sk_msgs
  vrf: Increment Icmp6InMsgs on the original netdev
  cpuset: restore sanity to cpuset_cpus_allowed_fallback()
  net: ethtool: Allow matching on vlan DEI bit
  linux-next: DOC: RDS: Fix a typo in rds.txt
  x86/kgdb: Return 0 from kgdb_arch_set_breakpoint()
  mpls: fix af_mpls dependencies for real
  selinux: fix a missing-check bug in selinux_sb_eat_lsm_opts()
  selinux: fix a missing-check bug in selinux_add_mnt_opt( )
  arm64: tlbflush: Ensure start/end of address range are aligned to stride
  usb: typec: Make sure an alt mode exist before getting its partner
  KVM: arm/arm64: vgic: Fix kvm_device leak in vgic_its_destroy
  KVM: arm64: Filter out invalid core register IDs in KVM_GET_REG_LIST
  KVM: arm64: Implement vq_present() as a macro
  xdp: check device pointer before clearing
  bpf: net: Set sk_bpf_storage back to NULL for cloned sk
  Btrfs: fix race between block group removal and block group allocation
  clocksource/drivers/arm_arch_timer: Don't trace count reader functions
  i2c: pca-platform: Fix GPIO lookup code
  thunderbolt: Make sure device runtime resume completes before taking domain lock
  drm: add fallback override/firmware EDID modes workaround
  i2c: acorn: fix i2c warning
  arm64: Don't unconditionally add -Wno-psabi to KBUILD_CFLAGS
  drm/edid: abstract override/firmware EDID retrieval
  platform/mellanox: mlxreg-hotplug: Add devm_free_irq call to remove flow
  platform/x86: mlx-platform: Fix parent device in i2c-mux-reg device registration
  platform/x86: intel-vbtn: Report switch events when event wakes device
  platform/x86: asus-wmi: Only Tell EC the OS will handle display hotkeys from asus_nb_wmi
  ARM: mvebu_v7_defconfig: fix Ethernet on Clearfog
  x86/resctrl: Prevent NULL pointer dereference when local MBM is disabled
  x86/resctrl: Don't stop walking closids when a locksetup group is found
  iommu/arm-smmu: Avoid constant zero in TLBI writes
  drm/i915/perf: fix whitelist on Gen10+
  drm/i915/sdvo: Implement proper HDMI audio support for SDVO
  drm/i915: Fix per-pixel alpha with CCS
  drm/i915/dmc: protect against reading random memory
  drm/i915/dsi: Use a fuzzy check for burst mode clock check
  Input: imx_keypad - make sure keyboard can always wake up system
  selinux: log raw contexts as untrusted strings
  ptrace: restore smp_rmb() in __ptrace_may_access()
  IB/hfi1: Correct tid qp rcd to match verbs context
  IB/hfi1: Close PSM sdma_progress sleep window
  IB/hfi1: Validate fault injection opcode user input
  geneve: Don't assume linear buffers in error handler
  vxlan: Don't assume linear buffers in error handler
  net: openvswitch: do not free vport if register_netdevice() is failed.
  net: correct udp zerocopy refcnt also when zerocopy only on append
  drm/amdgpu/{uvd,vcn}: fetch ring's read_ptr after alloc
  ovl: fix wrong flags check in FS_IOC_FS[SG]ETXATTR ioctls
  riscv: Fix udelay in RV32.
  drm/vmwgfx: fix a warning due to missing dma_parms
  riscv: export pm_power_off again
  drm/vmwgfx: Honor the sg list segment size limitation
  RISC-V: defconfig: enable clocks, serial console
  drm/vmwgfx: Use the backdoor port if the HB port is not available
  bpf: lpm_trie: check left child of last leftmost node for NULL
  Revert "fuse: require /dev/fuse reads to have enough buffer capacity"
  ALSA: ice1712: Check correct return value to snd_i2c_sendbytes (EWS/DMX 6Fire)
  ALSA: oxfw: allow PCM capture for Stanton SCS.1m
  ALSA: firewire-motu: fix destruction of data for isochronous resources
  s390/ctl_reg: mark __ctl_set_bit and __ctl_clear_bit as __always_inline
  s390/boot: disable address-of-packed-member warning
  ANDROID: update gki aarch64 ABI representation
  cgroup: Fix css_task_iter_advance_css_set() cset skip condition
  drm/panfrost: Require the simple_ondemand governor
  drm/panfrost: make devfreq optional again
  drm/gem_shmem: Use a writecombine mapping for ->vaddr
  mmc: sdhi: disallow HS400 for M3-W ES1.2, RZ/G2M, and V3H
  ASoC: Intel: sst: fix kmalloc call with wrong flags
  ASoC: core: Fix deadlock in snd_soc_instantiate_card()
  cgroup/bfq: revert bfq.weight symlink change
  ARM: dts: am335x phytec boards: Fix cd-gpios active level
  ARM: dts: dra72x: Disable usb4_tm target module
  nfp: ensure skb network header is set for packet redirect
  tcp: fix undo spurious SYNACK in passive Fast Open
  mpls: fix af_mpls dependencies
  ibmvnic: Fix unchecked return codes of memory allocations
  ibmvnic: Refresh device multicast list after reset
  ibmvnic: Do not close unopened driver during reset
  mpls: fix warning with multi-label encap
  net: phy: rename Asix Electronics PHY driver
  ipv6: flowlabel: fl6_sock_lookup() must use atomic_inc_not_zero
  net: ipv4: fib_semantics: fix uninitialized variable
  Input: iqs5xx - get axis info before calling input_mt_init_slots()
  Linux 5.2-rc4
  drm: panel-orientation-quirks: Add quirk for GPD MicroPC
  drm: panel-orientation-quirks: Add quirk for GPD pocket2
  counter/ftm-quaddec: Add missing dependencies in Kconfig
  staging: iio: adt7316: Fix build errors when GPIOLIB is not set
  x86/fpu: Update kernel's FPU state before using for the fsave header
  MAINTAINERS: Karthikeyan Ramasubramanian is MIA
  i2c: xiic: Add max_read_len quirk
  ANDROID: update ABI representation
  gpio: pca953x: hack to fix 24 bit gpio expanders
  net/mlx5e: Support tagged tunnel over bond
  net/mlx5e: Avoid detaching non-existing netdev under switchdev mode
  net/mlx5e: Fix source port matching in fdb peer flow rule
  net/mlx5e: Replace reciprocal_scale in TX select queue function
  net/mlx5e: Add ndo_set_feature for uplink representor
  net/mlx5: Avoid reloading already removed devices
  net/mlx5: Update pci error handler entries and command translation
  RAS/CEC: Convert the timer callback to a workqueue
  RAS/CEC: Fix binary search function
  x86/mm/KASLR: Compute the size of the vmemmap section properly
  can: purge socket error queue on sock destruct
  can: flexcan: Remove unneeded registration message
  can: af_can: Fix error path of can_init()
  can: m_can: implement errata "Needless activation of MRAF irq"
  can: mcp251x: add support for mcp25625
  dt-bindings: can: mcp251x: add mcp25625 support
  can: xilinx_can: use correct bittiming_const for CAN FD core
  can: flexcan: fix timeout when set small bitrate
  can: usb: Kconfig: Remove duplicate menu entry
  lockref: Limit number of cmpxchg loop retries
  uaccess: add noop untagged_addr definition
  x86/insn-eval: Fix use-after-free access to LDT entry
  kbuild: use more portable 'command -v' for cc-cross-prefix
  s390/unwind: correct stack switching during unwind
  scsi: hpsa: correct ioaccel2 chaining
  btrfs: Always trim all unallocated space in btrfs_trim_free_extents
  netfilter: ipv6: nf_defrag: accept duplicate fragments again
  powerpc/32s: fix booting with CONFIG_PPC_EARLY_DEBUG_BOOTX
  drm/meson: fix G12A primary plane disabling
  drm/meson: fix primary plane disabling
  drm/meson: fix G12A HDMI PLL settings for 4K60 1000/1001 variations
  block, bfq: add weight symlink to the bfq.weight cgroup parameter
  cgroup: let a symlink too be created with a cftype file
  powerpc/64s: __find_linux_pte() synchronization vs pmdp_invalidate()
  powerpc/64s: Fix THP PMD collapse serialisation
  powerpc: Fix kexec failure on book3s/32
  drm/nouveau/secboot/gp10[2467]: support newer FW to fix SEC2 failures on some boards
  drm/nouveau/secboot: enable loading of versioned LS PMU/SEC2 ACR msgqueue FW
  drm/nouveau/secboot: split out FW version-specific LS function pointers
  drm/nouveau/secboot: pass max supported FW version to LS load funcs
  drm/nouveau/core: support versioned firmware loading
  drm/nouveau/core: pass subdev into nvkm_firmware_get, rather than device
  block: free sched's request pool in blk_cleanup_queue
  bpf: expand section tests for test_section_names
  bpf: more msg_name rewrite tests to test_sock_addr
  bpf, bpftool: enable recvmsg attach types
  bpf, libbpf: enable recvmsg attach types
  bpf: sync tooling uapi header
  bpf: fix unconnected udp hooks
  vfio/mdev: Synchronize device create/remove with parent removal
  vfio/mdev: Avoid creating sysfs remove file on stale device removal
  pktgen: do not sleep with the thread lock held.
  net: mvpp2: Use strscpy to handle stat strings
  net: rds: fix memory leak in rds_ib_flush_mr_pool
  ipv6: fix EFAULT on sendto with icmpv6 and hdrincl
  ipv6: use READ_ONCE() for inet->hdrincl as in ipv4
  soundwire: intel: set dai min and max channels correctly
  soundwire: stream: fix bad unlock balance
  x86/fpu: Use fault_in_pages_writeable() for pre-faulting
  nvme-rdma: use dynamic dma mapping per command
  nvme: Fix u32 overflow in the number of namespace list calculation
  vfio/mdev: Improve the create/remove sequence
  SoC: rt274: Fix internal jack assignment in set_jack callback
  ALSA: hdac: fix memory release for SST and SOF drivers
  ASoC: SOF: Intel: hda: use the defined ppcap functions
  ASoC: core: move DAI pre-links initiation to snd_soc_instantiate_card
  ASoC: Intel: cht_bsw_rt5672: fix kernel oops with platform_name override
  ASoC: Intel: cht_bsw_nau8824: fix kernel oops with platform_name override
  ASoC: Intel: bytcht_es8316: fix kernel oops with platform_name override
  ASoC: Intel: cht_bsw_max98090: fix kernel oops with platform_name override
  Revert "gfs2: Replace gl_revokes with a GLF flag"
  arm64: Silence gcc warnings about arch ABI drift
  parisc: Fix crash due alternative coding for NP iopdir_fdc bit
  parisc: Use lpa instruction to load physical addresses in driver code
  parisc: configs: Remove useless UEVENT_HELPER_PATH
  parisc: Use implicit space register selection for loading the coherence index of I/O pdirs
  usb: gadget: udc: lpc32xx: fix return value check in lpc32xx_udc_probe()
  usb: gadget: dwc2: fix zlp handling
  usb: dwc2: Set actual frame number for completed ISOC transfer for none DDMA
  usb: gadget: udc: lpc32xx: allocate descriptor with GFP_ATOMIC
  usb: gadget: fusb300_udc: Fix memory leak of fusb300->ep[i]
  usb: phy: mxs: Disable external charger detect in mxs_phy_hw_init()
  usb: dwc2: Fix DMA cache alignment issues
  usb: dwc2: host: Fix wMaxPacketSize handling (fix webcam regression)
  ARM64: trivial: s/TIF_SECOMP/TIF_SECCOMP/ comment typo fix
  drm/komeda: Potential error pointer dereference
  drm/komeda: remove set but not used variable 'kcrtc'
  x86/CPU: Add more Icelake model numbers
  hwmon: (pmbus/core) Treat parameters as paged if on multiple pages
  hwmon: (pmbus/core) mutex_lock write in pmbus_set_samples
  hwmon: (core) add thermal sensors only if dev->of_node is present
  Revert "fib_rules: return 0 directly if an exactly same rule exists when NLM_F_EXCL not supplied"
  net: aquantia: fix wol configuration not applied sometimes
  ethtool: fix potential userspace buffer overflow
  Fix memory leak in sctp_process_init
  net: rds: fix memory leak when unload rds_rdma
  ipv6: fix the check before getting the cookie in rt6_get_cookie
  ipv4: not do cache for local delivery if bc_forwarding is enabled
  selftests: vm: Fix test build failure when built by itself
  tools: bpftool: Fix JSON output when lookup fails
  mmc: also set max_segment_size in the device
  mtip32xx: also set max_segment_size in the device
  rsxx: don't call dma_set_max_seg_size
  nvme-pci: don't limit DMA segement size
  s390/qeth: handle error when updating TX queue count
  s390/qeth: fix VLAN attribute in bridge_hostnotify udev event
  s390/qeth: check dst entry before use
  s390/qeth: handle limited IPv4 broadcast in L3 TX path
  ceph: fix error handling in ceph_get_caps()
  ceph: avoid iput_final() while holding mutex or in dispatch thread
  ceph: single workqueue for inode related works
  cgroup: css_task_iter_skip()'d iterators must be advanced before accessed
  drm/amd/amdgpu: add RLC firmware to support raven1 refresh
  drm/amd/powerplay: add set_power_profile_mode for raven1_refresh
  drm/amdgpu: fix ring test failure issue during s3 in vce 3.0 (V2)
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 450
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 449
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 448
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 446
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 445
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 444
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 443
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 442
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 441
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 440
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 438
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 437
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 436
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 435
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 434
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 433
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 432
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 431
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 430
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 429
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 428
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 426
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 424
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 423
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 422
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 421
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 420
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 419
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 418
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 417
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 416
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 414
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 412
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 411
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 410
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 409
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 408
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 407
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 406
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 405
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 404
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 403
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 402
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 401
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 400
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 399
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 398
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 397
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 396
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 395
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 394
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 393
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 392
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 391
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 390
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 389
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 388
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 387
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 380
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 378
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 377
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 376
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 375
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 373
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 372
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 371
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 370
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 367
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 365
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 364
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 363
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 362
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 354
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 353
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 352
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 351
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 350
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 349
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 348
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 347
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 346
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 345
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 344
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 343
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 342
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 341
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 340
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 339
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 338
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 336
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 335
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 334
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 333
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 332
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 330
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 328
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 326
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 325
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 324
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 323
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 322
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 321
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 320
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 316
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 315
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 314
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 313
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 312
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 311
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 310
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 309
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 308
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 307
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 305
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 301
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 300
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 299
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 297
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 296
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 295
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 294
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 292
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 291
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 290
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 289
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 288
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 287
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 286
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 285
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 284
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 283
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 282
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 281
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 280
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 278
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 277
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 276
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 275
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 274
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 273
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 272
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 271
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 270
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 269
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 268
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 267
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 266
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 265
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 264
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 263
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 262
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 260
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 258
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 257
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 256
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 254
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 253
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 252
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 251
  lib/test_stackinit: Handle Clang auto-initialization pattern
  block: Drop unlikely before IS_ERR(_OR_NULL)
  xen/swiotlb: don't initialize swiotlb twice on arm64
  s390/mm: fix address space detection in exception handling
  HID: logitech-dj: Fix 064d:c52f receiver support
  Revert "HID: core: Call request_module before doing device_add"
  Revert "HID: core: Do not call request_module() in async context"
  Revert "HID: Increase maximum report size allowed by hid_field_extract()"
  tests: fix pidfd-test compilation
  signal: improve comments
  samples: fix pidfd-metadata compilation
  arm64: arch_timer: mark functions as __always_inline
  arm64: smp: Moved cpu_logical_map[] to smp.h
  arm64: cpufeature: Fix missing ZFR0 in __read_sysreg_by_encoding()
  selftests/bpf: move test_lirc_mode2_user to TEST_GEN_PROGS_EXTENDED
  USB: Fix chipmunk-like voice when using Logitech C270 for recording audio.
  USB: usb-storage: Add new ID to ums-realtek
  udmabuf: actually unmap the scatterlist
  net: fix indirect calls helpers for ptype list hooks.
  net: ipvlan: Fix ipvlan device tso disabled while NETIF_F_IP_CSUM is set
  scsi: smartpqi: unlock on error in pqi_submit_raid_request_synchronous()
  scsi: ufs: Check that space was properly alloced in copy_query_response
  udp: only choose unbound UDP socket for multicast when not in a VRF
  net/tls: replace the sleeping lock around RX resync with a bit lock
  Revert "net/tls: avoid NULL-deref on resync during device removal"
  block: aoe: no need to check return value of debugfs_create functions
  net: dsa: sja1105: Fix link speed not working at 100 Mbps and below
  net: phylink: avoid reducing support mask
  scripts/checkstack.pl: Fix arm64 wrong or unknown architecture
  kbuild: tar-pkg: enable communication with jobserver
  kconfig: tests: fix recursive inclusion unit test
  kbuild: teach kselftest-merge to find nested config files
  nvmet: fix data_len to 0 for bdev-backed write_zeroes
  MAINTAINERS: Hand over skd maintainership
  ASoC: sun4i-i2s: Add offset to RX channel select
  ASoC: sun4i-i2s: Fix sun8i tx channel offset mask
  ASoC: max98090: remove 24-bit format support if RJ is 0
  ASoC: da7219: Fix build error without CONFIG_I2C
  ASoC: SOF: Intel: hda: Fix COMPILE_TEST build error
  drm/arm/hdlcd: Allow a bit of clock tolerance
  drm/arm/hdlcd: Actually validate CRTC modes
  drm/arm/mali-dp: Add a loop around the second set CVAL and try 5 times
  drm/komeda: fixing of DMA mapping sg segment warning
  netfilter: ipv6: nf_defrag: fix leakage of unqueued fragments
  habanalabs: Read upper bits of trace buffer from RWPHI
  arm64: arch_k3: Fix kconfig dependency warning
  drm: don't block fb changes for async plane updates
  drm/vc4: fix fb references in async update
  drm/msm: fix fb references in async update
  drm/amd: fix fb references in async update
  drm/rockchip: fix fb references in async update
  xen-blkfront: switch kcalloc to kvcalloc for large array allocation
  drm/mediatek: call mtk_dsi_stop() after mtk_drm_crtc_atomic_disable()
  drm/mediatek: clear num_pipes when unbind driver
  drm/mediatek: call drm_atomic_helper_shutdown() when unbinding driver
  drm/mediatek: unbind components in mtk_drm_unbind()
  drm/mediatek: fix unbind functions
  net: sfp: read eeprom in maximum 16 byte increments
  selftests: set sysctl bc_forwarding properly in router_broadcast.sh
  ANDROID: update gki aarch64 ABI representation
  net: ethernet: mediatek: Use NET_IP_ALIGN to judge if HW RX_2BYTE_OFFSET is enabled
  net: ethernet: mediatek: Use hw_feature to judge if HWLRO is supported
  net: ethernet: ti: cpsw_ethtool: fix ethtool ring param set
  ANDROID: gki_defconfig: Enable CMA, SLAB_FREELIST (RANDOM and HARDENED) on x86
  bpf: udp: Avoid calling reuseport's bpf_prog from udp_gro
  bpf: udp: ipv6: Avoid running reuseport's bpf_prog from __udp6_lib_err
  rcu: locking and unlocking need to always be at least barriers
  ANDROID: gki_defconfig: enable SLAB_FREELIST_RANDOM, SLAB_FREELIST_HARDENED
  ANDROID: gki_defconfig: enable CMA and increase CMA_AREAS
  ASoC: SOF: fix DSP oops definitions in FW ABI
  ASoC: hda: fix unbalanced codec dev refcount for HDA_DEV_ASOC
  ASoC: SOF: ipc: replace fw ready bitfield with explicit bit ordering
  ASoC: SOF: bump to ABI 3.6
  ASoC: SOF: soundwire: add initial soundwire support
  ASoC: SOF: uapi: mirror firmware changes
  ASoC: Intel: Baytrail: add quirk for Aegex 10 (RU2) tablet
  xfs: inode btree scrubber should calculate im_boffset correctly
  mmc: sdhci_am654: Fix SLOTTYPE write
  usb: typec: ucsi: ccg: fix memory leak in do_flash
  ANDROID: update gki aarch64 ABI representation
  habanalabs: Fix virtual address access via debugfs for 2MB pages
  drm/komeda: Constify the usage of komeda_component/pipeline/dev_funcs
  x86/power: Fix 'nosmt' vs hibernation triple fault during resume
  mm/vmalloc: Avoid rare case of flushing TLB with weird arguments
  mm/vmalloc: Fix calculation of direct map addr range
  PM: sleep: Add kerneldoc comments to some functions
  drm/i915/gvt: save RING_HEAD into vreg when vgpu switched out
  sparc: perf: fix updated event period in response to PERF_EVENT_IOC_PERIOD
  mdesc: fix a missing-check bug in get_vdev_port_node_info()
  drm/i915/gvt: add F_CMD_ACCESS flag for wa regs
  sparc64: Fix regression in non-hypervisor TLB flush xcall
  packet: unconditionally free po->rollover
  Update my email address
  net: hns: Fix loopback test failed at copper ports
  Linux 5.2-rc3
  net: dsa: mv88e6xxx: avoid error message on remove from VLAN 0
  mm, compaction: make sure we isolate a valid PFN
  include/linux/generic-radix-tree.h: fix kerneldoc comment
  kernel/signal.c: trace_signal_deliver when signal_group_exit
  drivers/iommu/intel-iommu.c: fix variable 'iommu' set but not used
  spdxcheck.py: fix directory structures
  kasan: initialize tag to 0xff in __kasan_kmalloc
  z3fold: fix sheduling while atomic
  scripts/gdb: fix invocation when CONFIG_COMMON_CLK is not set
  mm/gup: continue VM_FAULT_RETRY processing even for pre-faults
  ocfs2: fix error path kobject memory leak
  memcg: make it work on sparse non-0-node systems
  mm, memcg: consider subtrees in memory.events
  prctl_set_mm: downgrade mmap_sem to read lock
  prctl_set_mm: refactor checks from validate_prctl_map
  kernel/fork.c: make max_threads symbol static
  arch/arm/boot/compressed/decompress.c: fix build error due to lz4 changes
  arch/parisc/configs/c8000_defconfig: remove obsoleted CONFIG_DEBUG_SLAB_LEAK
  mm/vmalloc.c: fix typo in comment
  lib/sort.c: fix kernel-doc notation warnings
  mm: fix Documentation/vm/hmm.rst Sphinx warnings
  treewide: fix typos of SPDX-License-Identifier
  crypto: ux500 - fix license comment syntax error
  MAINTAINERS: add I2C DT bindings to ARM platforms
  MAINTAINERS: add DT bindings to i2c drivers
  mwifiex: Fix heap overflow in mwifiex_uap_parse_tail_ies()
  iwlwifi: mvm: change TLC config cmd sent by rs to be async
  iwlwifi: Fix double-free problems in iwl_req_fw_callback()
  iwlwifi: fix AX201 killer sku loading firmware issue
  iwlwifi: print fseq info upon fw assert
  iwlwifi: clear persistence bit according to device family
  iwlwifi: fix load in rfkill flow for unified firmware
  iwlwifi: mvm: remove d3_sram debugfs file
  bpf, riscv: clear high 32 bits for ALU32 add/sub/neg/lsh/rsh/arsh
  libbpf: Return btf_fd for load_sk_storage_btf
  HID: a4tech: fix horizontal scrolling
  HID: hyperv: Add a module description line
  net: dsa: sja1105: Don't store frame type in skb->cb
  block: print offending values when cloned rq limits are exceeded
  blk-mq: Document the blk_mq_hw_queue_to_node() arguments
  blk-mq: Fix spelling in a source code comment
  block: Fix bsg_setup_queue() kernel-doc header
  block: Fix rq_qos_wait() kernel-doc header
  block: Fix blk_mq_*_map_queues() kernel-doc headers
  block: Fix throtl_pending_timer_fn() kernel-doc header
  block: Convert blk_invalidate_devt() header into a non-kernel-doc header
  block/partitions/ldm: Convert a kernel-doc header into a non-kernel-doc header
  leds: avoid flush_work in atomic context
  cgroup: Include dying leaders with live threads in PROCS iterations
  cgroup: Implement css_task_iter_skip()
  cgroup: Call cgroup_release() before __exit_signal()
  netfilter: nf_tables: fix module autoload with inet family
  Revert "lockd: Show pid of lockd for remote locks"
  ALSA: hda/realtek - Update headset mode for ALC256
  fs/adfs: fix filename fixup handling for "/" and "//" names
  fs/adfs: move append_filetype_suffix() into adfs_object_fixup()
  fs/adfs: remove truncated filename hashing
  fs/adfs: factor out filename fixup
  fs/adfs: factor out object fixups
  fs/adfs: factor out filename case lowering
  fs/adfs: factor out filename comparison
  ovl: doc: add non-standard corner cases
  pstore/ram: Run without kernel crash dump region
  MAINTAINERS: add Vasily Gorbik and Christian Borntraeger for s390
  MAINTAINERS: Farewell Martin Schwidefsky
  pstore: Set tfm to NULL on free_buf_for_compression
  nds32: add new emulations for floating point instruction
  nds32: Avoid IEX status being incorrectly modified
  math-emu: Use statement expressions to fix Wshift-count-overflow warning
  net: correct zerocopy refcnt with udp MSG_MORE
  ethtool: Check for vlan etype or vlan tci when parsing flow_rule
  net: don't clear sock->sk early to avoid trouble in strparser
  net-gro: fix use-after-free read in napi_gro_frags()
  net: dsa: tag_8021q: Create a stable binary format
  net: dsa: tag_8021q: Change order of rx_vid setup
  net: mvpp2: fix bad MVPP2_TXQ_SCHED_TOKEN_CNTR_REG queue value
  docs cgroups: add another example size for hugetlb
  NFSv4.1: Fix bug only first CB_NOTIFY_LOCK is handled
  NFSv4.1: Again fix a race where CB_NOTIFY_LOCK fails to wake a waiter
  ipv4: tcp_input: fix stack out of bounds when parsing TCP options.
  mlxsw: spectrum: Prevent force of 56G
  mlxsw: spectrum_acl: Avoid warning after identical rules insertion
  SUNRPC: Fix a use after free when a server rejects the RPCSEC_GSS credential
  net: dsa: mv88e6xxx: fix handling of upper half of STATS_TYPE_PORT
  SUNRPC fix regression in umount of a secure mount
  r8169: fix MAC address being lost in PCI D3
  treewide: Add SPDX license identifier - Kbuild
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 225
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 224
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 223
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 222
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 221
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 220
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 218
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 217
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 216
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 215
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 214
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 213
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 211
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 210
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 209
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 207
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 206
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 203
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 201
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 200
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 199
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 198
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 197
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 195
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 194
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 193
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 191
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 190
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 188
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 185
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 183
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 182
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 180
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 179
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 178
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 177
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 176
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 175
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 174
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 173
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 172
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 171
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 170
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 167
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 166
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 165
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 164
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 162
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 161
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 160
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 159
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 158
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 157
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 156
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 155
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 154
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 153
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 152
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 151
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 150
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 149
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 148
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 147
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 145
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 144
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 143
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 142
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 140
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 139
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 138
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 137
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 136
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 135
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 133
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 132
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 131
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 130
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 129
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 128
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 127
  treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 126
  net: core: support XDP generic on stacked devices.
  netvsc: unshare skb in VF rx handler
  udp: Avoid post-GRO UDP checksum recalculation
  nvme-tcp: fix queue mapping when queue count is limited
  nvme-rdma: fix queue mapping when queue count is limited
  fpga: zynqmp-fpga: Correctly handle error pointer
  selftests: vm: install test_vmalloc.sh for run_vmtests
  userfaultfd: selftest: fix compiler warning
  kselftest/cgroup: fix incorrect test_core skip
  kselftest/cgroup: fix unexpected testing failure on test_core
  kselftest/cgroup: fix unexpected testing failure on test_memcontrol
  xtensa: Fix section mismatch between memblock_reserve and mem_reserve
  signal/ptrace: Don't leak unitialized kernel memory with PTRACE_PEEK_SIGINFO
  mwifiex: Abort at too short BSS descriptor element
  mwifiex: Fix possible buffer overflows at parsing bss descriptor
  drm/i915/gvt: Assign NULL to the pointer after memory free.
  drm/i915/gvt: Check if cur_pt_type is valid
  x86: intel_epb: Do not build when CONFIG_PM is unset
  crypto: hmac - fix memory leak in hmac_init_tfm()
  crypto: jitterentropy - change back to module_init()
  ARM: dts: Drop bogus CLKSEL for timer12 on dra7
  KVM: PPC: Book3S HV: Restore SPRG3 in kvmhv_p9_guest_entry()
  KVM: PPC: Book3S HV: Fix lockdep warning when entering guest on POWER9
  KVM: PPC: Book3S HV: XIVE: Fix page offset when clearing ESB pages
  KVM: PPC: Book3S HV: XIVE: Take the srcu read lock when accessing memslots
  KVM: PPC: Book3S HV: XIVE: Do not clear IRQ data of passthrough interrupts
  KVM: PPC: Book3S HV: XIVE: Introduce a new mutex for the XIVE device
  drm/i915/gvt: Fix cmd length of VEB_DI_IECP
  drm/i915/gvt: refine ggtt range validation
  drm/i915/gvt: Fix vGPU CSFE_CHICKEN1_REG mmio handler
  drm/i915/gvt: Fix GFX_MODE handling
  drm/i915/gvt: Update force-to-nonpriv register whitelist
  drm/i915/gvt: Initialize intel_gvt_gtt_entry in stack
  ima: show rules with IMA_INMASK correctly
  evm: check hash algorithm passed to init_desc()
  scsi: libsas: delete sas port if expander discover failed
  scsi: libsas: only clear phy->in_shutdown after shutdown event done
  scsi: scsi_dh_alua: Fix possible null-ptr-deref
  scsi: smartpqi: properly set both the DMA mask and the coherent DMA mask
  scsi: zfcp: fix to prevent port_remove with pure auto scan LUNs (only sdevs)
  scsi: zfcp: fix missing zfcp_port reference put on -EBUSY from port_remove
  scsi: libcxgbi: add a check for NULL pointer in cxgbi_check_route()
  net: phy: dp83867: Set up RGMII TX delay
  net: phy: dp83867: do not call config_init twice
  net: phy: dp83867: increase SGMII autoneg timer duration
  net: phy: dp83867: fix speed 10 in sgmii mode
  net: phy: marvell10g: report if the PHY fails to boot firmware
  net: phylink: ensure consistent phy interface mode
  cgroup: Use css_tryget() instead of css_tryget_online() in task_get_css()
  blk-mq: Fix memory leak in error handling
  usbip: usbip_host: fix stub_dev lock context imbalance regression
  net: sh_eth: fix mdio access in sh_eth_close() for R-Car Gen2 and RZ/A1 SoCs
  MIPS: uprobes: remove set but not used variable 'epc'
  s390/crypto: fix possible sleep during spinlock aquired
  MIPS: pistachio: Build uImage.gz by default
  MIPS: Make virt_addr_valid() return bool
  MIPS: Bounds check virt_addr_valid
  CIFS: cifs_read_allocate_pages: don't iterate through whole page array on ENOMEM
  RDMA/efa: Remove MAYEXEC flag check from mmap flow
  mlx5: avoid 64-bit division
  IB/hfi1: Validate page aligned for a given virtual address
  IB/{qib, hfi1, rdmavt}: Correct ibv_devinfo max_mr value
  IB/hfi1: Insure freeze_work work_struct is canceled on shutdown
  IB/rdmavt: Fix alloc_qpn() WARN_ON()
  ASoC: sun4i-codec: fix first delay on Speaker
  drm/amdgpu: reserve stollen vram for raven series
  media: venus: hfi_parser: fix a regression in parser
  selftests: bpf: fix compiler warning in flow_dissector test
  arm64: use the correct function type for __arm64_sys_ni_syscall
  arm64: use the correct function type in SYSCALL_DEFINE0
  arm64: fix syscall_fn_t type
  block: don't protect generic_make_request_checks with blk_queue_enter
  block: move blk_exit_queue into __blk_release_queue
  selftests: bpf: complete sub-register zero extension checks
  selftests: bpf: move sub-register zero extension checks into subreg.c
  ovl: detect overlapping layers
  drm/i915/icl: Add WaDisableBankHangMode
  ALSA: fireface: Use ULL suffixes for 64-bit constants
  signal/arm64: Use force_sig not force_sig_fault for SIGKILL
  nl80211: fill all policy .type entries
  mac80211: free peer keys before vif down in mesh
  ANDROID: ABI out: Use the extension .xml rather then .out
  drm/mediatek: respect page offset for PRIME mmap calls
  drm/mediatek: adjust ddp clock control flow
  ALSA: hda/realtek - Improve the headset mic for Acer Aspire laptops
  KVM: PPC: Book3S HV: XIVE: Fix the enforced limit on the vCPU identifier
  KVM: PPC: Book3S HV: XIVE: Do not test the EQ flag validity when resetting
  KVM: PPC: Book3S HV: XIVE: Clear file mapping when device is released
  KVM: PPC: Book3S HV: Don't take kvm->lock around kvm_for_each_vcpu
  KVM: PPC: Book3S: Use new mutex to synchronize access to rtas token list
  KVM: PPC: Book3S HV: Use new mutex to synchronize MMU setup
  KVM: PPC: Book3S HV: Avoid touching arch.mmu_ready in XIVE release functions
  Revert "drivers: thermal: tsens: Add new operation to check if a sensor is enabled"
  net/mlx5e: Disable rxhash when CQE compress is enabled
  net/mlx5e: restrict the real_dev of vlan device is the same as uplink device
  net/mlx5: Allocate root ns memory using kzalloc to match kfree
  net/mlx5: Avoid double free in fs init error unwinding path
  net/mlx5: Avoid double free of root ns in the error flow path
  net/mlx5: Fix error handling in mlx5_load()
  Documentation: net-sysfs: Remove duplicate PHY device documentation
  llc: fix skb leak in llc_build_and_send_ui_pkt()
  selftests: pmtu: Fix encapsulating device in pmtu_vti6_link_change_mtu
  dfs_cache: fix a wrong use of kfree in flush_cache_ent()
  fs/cifs/smb2pdu.c: fix buffer free in SMB2_ioctl_free
  cifs: fix memory leak of pneg_inbuf on -EOPNOTSUPP ioctl case
  xenbus: Avoid deadlock during suspend due to open transactions
  xen/pvcalls: Remove set but not used variable
  tracing: Avoid memory leak in predicate_parse()
  habanalabs: fix bug in checking huge page optimization
  mmc: sdhci: Fix SDIO IRQ thread deadlock
  dpaa_eth: use only online CPU portals
  net: mvneta: Fix err code path of probe
  net: stmmac: Do not output error on deferred probe
  Btrfs: fix race updating log root item during fsync
  Btrfs: fix wrong ctime and mtime of a directory after log replay
  ARC: [plat-hsdk] Get rid of inappropriate PHY settings
  ARC: [plat-hsdk]: Add support of Vivante GPU
  ARC: [plat-hsdk]: enable creg-gpio controller
  Btrfs: fix fsync not persisting changed attributes of a directory
  btrfs: qgroup: Check bg while resuming relocation to avoid NULL pointer dereference
  btrfs: reloc: Also queue orphan reloc tree for cleanup to avoid BUG_ON()
  Btrfs: incremental send, fix emission of invalid clone operations
  Btrfs: incremental send, fix file corruption when no-holes feature is enabled
  btrfs: correct zstd workspace manager lock to use spin_lock_bh()
  btrfs: Ensure replaced device doesn't have pending chunk allocation
  ia64: fix build errors by exporting paddr_to_nid()
  ASoC: SOF: Intel: hda: fix the hda init chip
  ASoC: SOF: ipc: fix a race, leading to IPC timeouts
  ASoC: SOF: control: correct the copy size for bytes kcontrol put
  ASoC: SOF: pcm: remove warning - initialize workqueue on open
  ASoC: SOF: pcm: clear hw_params_upon_resume flag correctly
  ASoC: SOF: core: fix error handling with the probe workqueue
  ASoC: SOF: core: remove snd_soc_unregister_component in case of error
  ASoC: SOF: core: remove DSP after unregistering machine driver
  ASoC: soc-core: fixup references at soc_cleanup_card_resources()
  arm64/module: revert to unsigned interpretation of ABS16/32 relocations
  KVM: s390: Do not report unusabled IDs via KVM_CAP_MAX_VCPU_ID
  kvm: fix compile on s390 part 2
  xprtrdma: Use struct_size() in kzalloc()
  tools headers UAPI: Sync kvm.h headers with the kernel sources
  perf record: Fix s390 missing module symbol and warning for non-root users
  perf machine: Read also the end of the kernel
  perf test vmlinux-kallsyms: Ignore aliases to _etext when searching on kallsyms
  perf session: Add missing swap ops for namespace events
  perf namespace: Protect reading thread's namespace
  tools headers UAPI: Sync drm/drm.h with the kernel
  s390/crypto: fix gcm-aes-s390 selftest failures
  s390/zcrypt: Fix wrong dispatching for control domain CPRBs
  s390/pci: fix assignment of bus resources
  s390/pci: fix struct definition for set PCI function
  s390: mark __cpacf_check_opcode() and cpacf_query_func() as __always_inline
  s390: add unreachable() to dump_fault_info() to fix -Wmaybe-uninitialized
  tools headers UAPI: Sync drm/i915_drm.h with the kernel
  tools headers UAPI: Sync linux/fs.h with the kernel
  tools headers UAPI: Sync linux/sched.h with the kernel
  tools arch x86: Sync asm/cpufeatures.h with the with the kernel
  tools include UAPI: Update copy of files related to new fspick, fsmount, fsconfig, fsopen, move_mount and open_tree syscalls
  perf arm64: Fix mksyscalltbl when system kernel headers are ahead of the kernel
  perf data: Fix 'strncat may truncate' build failure with recent gcc
  arm64: Fix the arm64_personality() syscall wrapper redirection
  rtw88: Make some symbols static
  rtw88: avoid circular locking between local->iflist_mtx and rtwdev->mutex
  rsi: Properly initialize data in rsi_sdio_ta_reset
  rtw88: fix unassigned rssi_level in rtw_sta_info
  rtw88: fix subscript above array bounds compiler warning
  fuse: extract helper for range writeback
  fuse: fix copy_file_range() in the writeback case
  mmc: meson-gx: fix irq ack
  mmc: tmio: fix SCC error handling to avoid false positive CRC error
  mmc: tegra: Fix a warning message
  memstick: mspro_block: Fix an error code in mspro_block_issue_req()
  mac80211: mesh: fix RCU warning
  nl80211: fix station_info pertid memory leak
  mac80211: Do not use stack memory with scatterlist for GMAC
  ALSA: line6: Assure canceling delayed work at disconnection
  configfs: Fix use-after-free when accessing sd->s_dentry
  ALSA: hda - Force polling mode on CNL for fixing codec communication
  i2c: synquacer: fix synquacer_i2c_doxfer() return value
  i2c: mlxcpld: Fix wrong initialization order in probe
  i2c: dev: fix potential memory leak in i2cdev_ioctl_rdwr
  RDMA/core: Fix panic when port_data isn't initialized
  RDMA/uverbs: Pass udata on uverbs error unwind
  RDMA/core: Clear out the udata before error unwind
  net: aquantia: tcp checksum 0xffff being handled incorrectly
  net: aquantia: fix LRO with FCS error
  net: aquantia: check rx csum for all packets in LRO session
  net: aquantia: tx clean budget logic error
  vhost: scsi: add weight support
  vhost: vsock: add weight support
  vhost_net: fix possible infinite loop
  vhost: introduce vhost_exceeds_weight()
  virtio: Fix indentation of VIRTIO_MMIO
  virtio: add unlikely() to WARN_ON_ONCE()
  iommu/vt-d: Set the right field for Page Walk Snoop
  iommu/vt-d: Fix lock inversion between iommu->lock and device_domain_lock
  iommu: Add missing new line for dma type
  drm/etnaviv: lock MMU while dumping core
  block: Don't revalidate bdev of hidden gendisk
  loop: Don't change loop device under exclusive opener
  drm/imx: ipuv3-plane: fix atomic update status query for non-plus i.MX6Q
  drm/qxl: drop WARN_ONCE()
  iio: temperature: mlx90632 Relax the compatibility check
  iio: imu: st_lsm6dsx: fix PM support for st_lsm6dsx i2c controller
  staging:iio:ad7150: fix threshold mode config bit
  fuse: add FUSE_WRITE_KILL_PRIV
  fuse: fallocate: fix return with locked inode
  PCI: PM: Avoid possible suspend-to-idle issue
  ACPI: PM: Call pm_set_suspend_via_firmware() during hibernation
  ACPI/PCI: PM: Add missing wakeup.flags.valid checks
  ovl: support the FS_IOC_FS[SG]ETXATTR ioctls
  soundwire: stream: fix out of boundary access on port properties
  net: tulip: de4x5: Drop redundant MODULE_DEVICE_TABLE()
  selftests/tls: add test for sleeping even though there is data
  net/tls: fix no wakeup on partial reads
  selftests/tls: test for lowat overshoot with multiple records
  net/tls: fix lowat calculation if some data came from previous record
  dpaa2-eth: Make constant 64-bit long
  dpaa2-eth: Use PTR_ERR_OR_ZERO where appropriate
  dpaa2-eth: Fix potential spectre issue
  bonding/802.3ad: fix slave link initialization transition states
  io_uring: Fix __io_uring_register() false success
  net: ethtool: Document get_rxfh_context and set_rxfh_context ethtool ops
  net: stmmac: dwmac-mediatek: modify csr_clk value to fix mdio read/write fail
  net: stmmac: fix csr_clk can't be zero issue
  net: stmmac: update rx tail pointer register to fix rx dma hang issue.
  ip_sockglue: Fix missing-check bug in ip_ra_control()
  ipv6_sockglue: Fix a missing-check bug in ip6_ra_control()
  efi: Allow the number of EFI configuration tables entries to be zero
  efi/x86/Add missing error handling to old_memmap 1:1 mapping code
  parisc: Fix compiler warnings in float emulation code
  parisc/slab: cleanup after /proc/slab_allocators removal
  bpf: sockmap, fix use after free from sleep in psock backlog workqueue
  net: sched: don't use tc_action->order during action dump
  cxgb4: Revert "cxgb4: Remove SGE_HOST_PAGE_SIZE dependency on page size"
  net: fec: fix the clk mismatch in failed_reset path
  habanalabs: Avoid using a non-initialized MMU cache mutex
  habanalabs: fix debugfs code
  uapi/habanalabs: add opcode for enable/disable device debug mode
  habanalabs: halt debug engines on user process close
  selftests: rtc: rtctest: specify timeouts
  selftests/harness: Allow test to configure timeout
  selftests/ftrace: Add checkbashisms meta-testcase
  selftests/ftrace: Make a script checkbashisms clean
  media: smsusb: better handle optional alignment
  test_firmware: Use correct snprintf() limit
  genwqe: Prevent an integer overflow in the ioctl
  parport: Fix mem leak in parport_register_dev_model
  fpga: dfl: expand minor range when registering chrdev region
  fpga: dfl: Add lockdep classes for pdata->lock
  fpga: dfl: afu: Pass the correct device to dma_mapping_error()
  fpga: stratix10-soc: fix use-after-free on s10_init()
  w1: ds2408: Fix typo after 49695ac468 (reset on output_write retry with readback)
  kheaders: Do not regenerate archive if config is not changed
  kheaders: Move from proc to sysfs
  drm/amd/display: Don't load DMCU for Raven 1 (v2)
  drm/i915: Maintain consistent documentation subsection ordering
  scripts/sphinx-pre-install: make it handle Sphinx versions
  docs: Fix conf.py for Sphinx 2.0
  vt/fbcon: deinitialize resources in visual_init() after failed memory allocation
  xfs: fix broken log reservation debugging
  clocksource/drivers/timer-ti-dm: Change to new style declaration
  ASoC: core: lock client_mutex while removing link components
  ASoC: simple-card: Restore original configuration of DAI format
  {nl,mac}80211: allow 4addr AP operation on crypto controlled devices
  mac80211_hwsim: mark expected switch fall-through
  mac80211: fix rate reporting inside cfg80211_calculate_bitrate_he()
  mac80211: remove set but not used variable 'old'
  mac80211: handle deauthentication/disassociation from TDLS peer
  gpio: fix gpio-adp5588 build errors
  pinctrl: stmfx: Fix compile issue when CONFIG_OF_GPIO is not defined
  staging: kpc2000: Add dependency on MFD_CORE to kconfig symbol 'KPC2000'
  perf/ring-buffer: Use regular variables for nesting
  perf/ring-buffer: Always use {READ,WRITE}_ONCE() for rb->user_page data
  perf/ring_buffer: Add ordering to rb->nest increment
  perf/ring_buffer: Fix exposing a temporarily decreased data_head
  x86/CPU/AMD: Don't force the CPB cap when running under a hypervisor
  x86/boot: Provide KASAN compatible aliases for string routines
  ALSA: hda/realtek - Enable micmute LED for Huawei laptops
  Input: uinput - add compat ioctl number translation for UI_*_FF_UPLOAD
  Input: silead - add MSSL0017 to acpi_device_id
  cxgb4: offload VLAN flows regardless of VLAN ethtype
  hsr: fix don't prune the master node from the node_db
  net: mvpp2: cls: Fix leaked ethtool_rx_flow_rule
  docs: fix multiple doc build warnings in enumeration.rst
  lib/list_sort: fix kerneldoc build error
  docs: fix numaperf.rst and add it to the doc tree
  doc: Cope with the deprecation of AutoReporter
  doc: Cope with Sphinx logging deprecations
  bpf: sockmap, restore sk_write_space when psock gets dropped
  selftests: bpf: add zero extend checks for ALU32 and/or/xor
  bpf, riscv: clear target register high 32-bits for and/or/xor on ALU32
  spi: abort spi_sync if failed to prepare_transfer_hardware
  ALSA: hda/realtek - Set default power save node to 0
  ipv4/igmp: fix build error if !CONFIG_IP_MULTICAST
  powerpc/kexec: Fix loading of kernel + initramfs with kexec_file_load()
  MIPS: TXx9: Fix boot crash in free_initmem()
  MIPS: remove a space after -I to cope with header search paths for VDSO
  MIPS: mark ginvt() as __always_inline
  ipv4/igmp: fix another memory leak in igmpv3_del_delrec()
  bnxt_en: Device serial number is supported only for PFs.
  bnxt_en: Reduce memory usage when running in kdump kernel.
  bnxt_en: Fix possible BUG() condition when calling pci_disable_msix().
  bnxt_en: Fix aggregation buffer leak under OOM condition.
  ipv6: Fix redirect with VRF
  net: stmmac: fix reset gpio free missing
  mISDN: make sure device name is NUL terminated
  net: macb: save/restore the remaining registers and features
  media: dvb: warning about dvb frequency limits produces too much noise
  net/tls: don't ignore netdev notifications if no TLS features
  net/tls: fix state removal with feature flags off
  net/tls: avoid NULL-deref on resync during device removal
  Documentation: add TLS offload documentation
  Documentation: tls: RSTify the ktls documentation
  Documentation: net: move device drivers docs to a submenu
  mISDN: Fix indenting in dsp_cmx.c
  ocelot: Dont allocate another multicast list, use __dev_mc_sync
  Validate required parameters in inet6_validate_link_af
  xhci: Use %zu for printing size_t type
  xhci: Convert xhci_handshake() to use readl_poll_timeout_atomic()
  xhci: Fix immediate data transfer if buffer is already DMA mapped
  usb: xhci: avoid null pointer deref when bos field is NULL
  usb: xhci: Fix a potential null pointer dereference in xhci_debugfs_create_endpoint()
  xhci: update bounce buffer with correct sg num
  media: usb: siano: Fix false-positive "uninitialized variable" warning
  spi: spi-fsl-spi: call spi_finalize_current_message() at the end
  ALSA: hda/realtek - Check headset type by unplug and resume
  powerpc/perf: Fix MMCRA corruption by bhrb_filter
  powerpc/powernv: Return for invalid IMC domain
  HID: logitech-hidpp: Add support for the S510 remote control
  HID: multitouch: handle faulty Elo touch device
  selftests: netfilter: add flowtable test script
  netfilter: nft_flow_offload: IPCB is only valid for ipv4 family
  netfilter: nft_flow_offload: don't offload when sequence numbers need adjustment
  netfilter: nft_flow_offload: set liberal tracking mode for tcp
  netfilter: nf_flow_table: ignore DF bit setting
  ASoC: Intel: sof-rt5682: fix AMP quirk support
  ASoC: Intel: sof-rt5682: fix for codec button mapping
  clk: ti: clkctrl: Fix clkdm_clk handling
  clk: imx: imx8mm: fix int pll clk gate
  clk: sifive: restrict Kconfig scope for the FU540 PRCI driver
  RDMA/hns: Fix PD memory leak for internal allocation
  netfilter: nat: fix udp checksum corruption
  selftests: netfilter: missing error check when setting up veth interface
  RDMA/srp: Rename SRP sysfs name after IB device rename trigger
  ipvs: Fix use-after-free in ip_vs_in
  ARC: [plat-hsdk]: Add missing FIFO size entry in GMAC node
  ARC: [plat-hsdk]: Add missing multicast filter bins number to GMAC node
  samples, bpf: suppress compiler warning
  samples, bpf: fix to change the buffer size for read()
  bpf: Check sk_fullsock() before returning from bpf_sk_lookup()
  bpf: fix out-of-bounds read in __bpf_skc_lookup
  Documentation/networking: fix af_xdp.rst Sphinx warnings
  netfilter: nft_fib: Fix existence check support
  netfilter: nf_queue: fix reinject verdict handling
  dmaengine: sprd: Add interrupt support for 2-stage transfer
  dmaengine: sprd: Fix the right place to configure 2-stage transfer
  dmaengine: sprd: Fix block length overflow
  dmaengine: sprd: Fix the incorrect start for 2-stage destination channels
  dmaengine: sprd: Add validation of current descriptor in irq handler
  dmaengine: sprd: Fix the possible crash when getting descriptor status
  tty: max310x: Fix external crystal register setup
  serial: sh-sci: disable DMA for uart_console
  serial: imx: remove log spamming error message
  tty: serial: msm_serial: Fix XON/XOFF
  USB: serial: option: add Telit 0x1260 and 0x1261 compositions
  USB: serial: pl2303: add Allied Telesis VT-Kit3
  USB: serial: option: add support for Simcom SIM7500/SIM7600 RNDIS mode
  dmaengine: tegra210-adma: Fix spelling
  dmaengine: tegra210-adma: Fix channel FIFO configuration
  dmaengine: tegra210-adma: Fix crash during probe
  dmaengine: mediatek-cqdma: sleeping in atomic context
  dmaengine: dw-axi-dmac: fix null dereference when pointer first is null
  perf/x86/intel/ds: Fix EVENT vs. UEVENT PEBS constraints
  USB: rio500: update Documentation
  USB: rio500: simplify locking
  USB: rio500: fix memory leak in close after disconnect
  USB: rio500: refuse more than one device at a time
  usbip: usbip_host: fix BUG: sleeping function called from invalid context
  USB: sisusbvga: fix oops in error path of sisusb_probe
  USB: Add LPM quirk for Surface Dock GigE adapter
  media: usb: siano: Fix general protection fault in smsusb
  usb: mtu3: fix up undefined reference to usb_debug_root
  USB: Fix slab-out-of-bounds write in usb_get_bos_descriptor
  Input: elantech - enable middle button support on 2 ThinkPads
  dmaengine: fsl-qdma: Add improvement
  dmaengine: jz4780: Fix transfers being ACKed too soon
  gcc-plugins: Fix build failures under Darwin host
  MAINTAINERS: Update Stefan Wahren email address
  netfilter: nf_tables: fix oops during rule dump
  ARC: mm: SIGSEGV userspace trying to access kernel virtual memory
  ARC: fix build warnings
  ARM: dts: bcm: Add missing device_type = "memory" property
  soc: bcm: brcmstb: biuctrl: Register writes require a barrier
  soc: brcmstb: Fix error path for unsupported CPUs
  ARM: dts: dra71x: Disable usb4_tm target module
  ARM: dts: dra71x: Disable rtc target module
  ARM: dts: dra76x: Disable usb4_tm target module
  ARM: dts: dra76x: Disable rtc target module
  ASoC: simple-card: Fix configuration of DAI format
  ASoC: Intel: soc-acpi: Fix machine selection order
  ASoC: rt5677-spi: Handle over reading when flipping bytes
  ASoC: soc-dpm: fixup DAI active unbalance
  pinctrl: intel: Clear interrupt status in mask/unmask callback
  pinctrl: intel: Use GENMASK() consistently
  parisc: Allow building 64-bit kernel without -mlong-calls compiler option
  parisc: Kconfig: remove ARCH_DISCARD_MEMBLOCK
  staging: wilc1000: Fix some double unlock bugs in wilc_wlan_cleanup()
  staging: vc04_services: prevent integer overflow in create_pagelist()
  Staging: vc04_services: Fix a couple error codes
  staging: wlan-ng: fix adapter initialization failure
  staging: kpc2000: double unlock in error handling in kpc_dma_transfer()
  staging: kpc2000: Fix build error without CONFIG_UIO
  staging: kpc2000: fix build error on xtensa
  staging: erofs: set sb->s_root to NULL when failing from __getname()
  ARM: imx: cpuidle-imx6sx: Restrict the SW2ISO increase to i.MX6SX
  firmware: imx: SCU irq should ONLY be enabled after SCU IPC is ready
  arm64: imx: Fix build error without CONFIG_SOC_BUS
  ima: fix wrong signed policy requirement when not appraising
  x86/ima: Check EFI_RUNTIME_SERVICES before using
  stacktrace: Unbreak stack_trace_save_tsk_reliable()
  HID: wacom: Sync INTUOSP2_BT touch state after each frame if necessary
  HID: wacom: Correct button numbering 2nd-gen Intuos Pro over Bluetooth
  HID: wacom: Send BTN_TOUCH in response to INTUOSP2_BT eraser contact
  HID: wacom: Don't report anything prior to the tool entering range
  HID: wacom: Don't set tool type until we're in range
  ASoC: cs42xx8: Add regcache mask dirty
  regulator: tps6507x: Fix boot regression due to testing wrong init_data pointer
  ASoC: fsl_asrc: Fix the issue about unsupported rate
  spi: bitbang: Fix NULL pointer dereference in spi_unregister_master
  Input: elan_i2c - increment wakeup count if wake source
  wireless: Skip directory when generating certificates
  ASoC: ak4458: rstn_control - return a non-zero on error only
  ASoC: soc-pcm: BE dai needs prepare when pause release after resume
  ASoC: ak4458: add return value for ak4458_probe
  ASoC : cs4265 : readable register too low
  ASoC: SOF: fix error in verbose ipc command parsing
  ASoC: SOF: fix race in FW boot timeout handling
  ASoC: SOF: nocodec: fix undefined reference
  iio: adc: ti-ads8688: fix timestamp is not updated in buffer
  iio: dac: ds4422/ds4424 fix chip verification
  HID: rmi: Use SET_REPORT request on control endpoint for Acer Switch 3 and 5
  HID: logitech-hidpp: add support for the MX5500 keyboard
  HID: logitech-dj: add support for the Logitech MX5500's Bluetooth Mini-Receiver
  HID: i2c-hid: add iBall Aer3 to descriptor override
  spi: Fix Raspberry Pi breakage
  ARM: dts: dra76x: Update MMC2_HS200_MANUAL1 iodelay values
  ARM: dts: am57xx-idk: Remove support for voltage switching for SD card
  bus: ti-sysc: Handle devices with no control registers
  ARM: dts: Configure osc clock for d_can on am335x
  iio: imu: mpu6050: Fix FIFO layout for ICM20602
  lkdtm/bugs: Adjust recursion test to avoid elision
  lkdtm/usercopy: Moves the KERNEL_DS test to non-canonical
  iio: adc: ads124: avoid buffer overflow
  iio: adc: modify NPCM ADC read reference voltage

Change-Id: I98c823993370027391cc21dfb239c3049f025136
Signed-off-by: Raghavendra Rao Ananta <rananta@codeaurora.org>
2019-07-01 17:41:24 -07:00

3186 lines
88 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2015 Shaohua Li <shli@fb.com>
* Copyright (C) 2016 Song Liu <songliubraving@fb.com>
*/
#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/raid/md_p.h>
#include <linux/crc32c.h>
#include <linux/random.h>
#include <linux/kthread.h>
#include <linux/types.h>
#include "md.h"
#include "raid5.h"
#include "md-bitmap.h"
#include "raid5-log.h"
/*
 * Metadata and data are stored on disk in 4k units (a "block"), regardless
 * of the underlying hardware sector size. This only works with
 * PAGE_SIZE == 4096.
 *
 * BLOCK_SECTORS is the size of one block in sectors, and
 * (1 << BLOCK_SECTOR_SHIFT) == BLOCK_SECTORS.
 */
#define BLOCK_SECTORS (8)
#define BLOCK_SECTOR_SHIFT (3)
/*
 * log->max_free_space is min(1/4 disk size, 10G reclaimable space).
 *
 * In write-through mode, reclaim runs every log->max_free_space.
 * This keeps the recovery scan from running for too long.
 */
#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* in sectors */
#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
/* interval at which the reclaim thread is woken up periodically */
#define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ)
/*
 * Start flushing once this many full stripes are cached: one quarter of
 * conf->max_nr_stripes.
 *
 * The argument is parenthesized so that any pointer expression (for example
 * "p + 1" or "&obj") expands correctly.
 */
#define R5C_FULL_STRIPE_FLUSH_BATCH(conf) ((conf)->max_nr_stripes / 4)
/* stripes are reclaimed in groups of this size */
#define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2)
/*
 * We only need 2 bios per I/O unit to make progress, but we keep a few
 * more available so that the pool does not get too tight.
 */
#define R5L_POOL_SIZE 4
/*
 * Human-readable names of the journal modes.
 * NOTE(review): presumably indexed by enum r5c_journal_mode (write-through
 * first, write-back second) -- confirm against the enum definition.
 */
static char *r5c_journal_mode_str[] = {"write-through",
"write-back"};
/*
* raid5 cache state machine
*
* With the RAID cache, each stripe works in two phases:
* - caching phase
* - writing-out phase
*
* These two phases are controlled by bit STRIPE_R5C_CACHING:
* if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase
* if STRIPE_R5C_CACHING == 1, the stripe is in caching phase
*
* When there is no journal, or the journal is in write-through mode,
* the stripe is always in writing-out phase.
*
* For write-back journal, the stripe is sent to caching phase on write
* (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off
* the write-out phase by clearing STRIPE_R5C_CACHING.
*
* Stripes in caching phase do not write the raid disks. Instead, all
* writes are committed from the log device. Therefore, a stripe in
* caching phase handles writes as:
* - write to log device
* - return IO
*
* Stripes in writing-out phase handle writes as:
* - calculate parity
* - write pending data and parity to journal
* - write data and parity to raid disks
* - return IO for pending writes
*/
/*
 * State of the raid5 log: head and tail positions in the log ring, the
 * lists of io_units in their various stages, and the bookkeeping used by
 * the r5c cache.
 */
struct r5l_log {
struct md_rdev *rdev;
u32 uuid_checksum;
sector_t device_size; /* log device size, rounded to
* BLOCK_SECTORS */
sector_t max_free_space; /* reclaim runs if free space is at
* this size */
sector_t last_checkpoint; /* log tail, where the recovery scan
* starts from */
u64 last_cp_seq; /* log tail sequence */
sector_t log_start; /* log head, where new data is appended */
u64 seq; /* log head sequence */
sector_t next_checkpoint;
struct mutex io_mutex;
struct r5l_io_unit *current_io; /* current io_unit accepting new data */
spinlock_t io_list_lock;
struct list_head running_ios; /* io_units which are still running,
* and have not yet been completely
* written to the log */
struct list_head io_end_ios; /* io_units which have been completely
* written to the log but not yet written
* to the RAID */
struct list_head flushing_ios; /* io_units which are waiting for log
* cache flush */
struct list_head finished_ios; /* io_units which have settled down on the log disk */
struct bio flush_bio;
struct list_head no_mem_stripes; /* pending stripes, -ENOMEM */
struct kmem_cache *io_kc;
mempool_t io_pool;
struct bio_set bs;
mempool_t meta_pool;
struct md_thread *reclaim_thread;
unsigned long reclaim_target; /* amount of space that needs to be
* reclaimed. If it's 0, reclaim space
* used by io_units which are in
* IO_UNIT_STRIPE_END state (i.e. reclaim
* doesn't wait for a specific io_unit
* to switch to IO_UNIT_STRIPE_END
* state) */
wait_queue_head_t iounit_wait;
struct list_head no_space_stripes; /* pending stripes, log has no space */
spinlock_t no_space_stripes_lock;
bool need_cache_flush;
/* for r5c_cache */
enum r5c_journal_mode r5c_journal_mode;
/* all stripes in r5cache, in the order of seq at sh->log_start */
struct list_head stripe_in_journal_list;
spinlock_t stripe_in_journal_lock;
atomic_t stripe_in_journal_count;
/* to submit async io_units, to fulfill ordering of flush */
struct work_struct deferred_io_work;
/* to disable write back while in degraded mode */
struct work_struct disable_writeback_work;
/* for chunk_aligned_read in writeback mode, details below */
spinlock_t tree_lock;
struct radix_tree_root big_stripe_tree;
};
/*
* Enable chunk_aligned_read() with write back cache.
*
* Each chunk may contain more than one stripe (for example, a 256kB
* chunk contains 64 4kB pages, so this chunk contains 64 stripes). For
* chunk_aligned_read, these stripes are grouped into one "big_stripe".
* For each big_stripe, we count how many stripes of this big_stripe
* are in the write back cache. These data are tracked in a radix tree
* (big_stripe_tree). We use radix_tree item pointer as the counter.
* r5c_tree_index() is used to calculate keys for the radix tree.
*
* chunk_aligned_read() calls r5c_big_stripe_cached() to look up
* big_stripe of each chunk in the tree. If this big_stripe is in the
* tree, chunk_aligned_read() aborts. This look up is protected by
* rcu_read_lock().
*
* It is necessary to remember whether a stripe is counted in
* big_stripe_tree. Instead of adding a new flag, we reuse existing flags:
* STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE. If either of these
* two flags is set, the stripe is counted in big_stripe_tree. This
* requires moving set_bit(STRIPE_R5C_PARTIAL_STRIPE) to
* r5c_try_caching_write(); and moving clear_bit of
* STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE to
* r5c_finish_stripe_write_out().
*/
/*
 * The radix tree requires the lowest 2 bits of a data pointer to be 2'b00.
 * So it is necessary to left shift the counter by 2 bits before using it
 * as the data pointer of the tree.
 */
#define R5C_RADIX_COUNT_SHIFT 2
/*
 * Calculate the key for big_stripe_tree.
 *
 * @conf: array configuration; conf->chunk_sectors is the divisor
 * @sect: align_bi->bi_iter.bi_sector or sh->sector
 *
 * Returns @sect divided by conf->chunk_sectors. sector_div() leaves the
 * quotient in @sect and returns the remainder, which is not needed here.
 */
static inline sector_t r5c_tree_index(struct r5conf *conf,
				      sector_t sect)
{
	sector_div(sect, conf->chunk_sectors);
	return sect;
}
/*
 * An IO range starts at a meta data block and ends at the next meta data
 * block. The io unit's meta data block tracks the data/parity that follows
 * it. The io unit is written to the log disk with a normal write; as we
 * always flush the log disk first and only then start moving data to the
 * raid disks, there is no requirement to write the io unit with FLUSH/FUA.
 */
struct r5l_io_unit {
struct r5l_log *log;
struct page *meta_page; /* stores the meta block */
int meta_offset; /* current offset in meta_page */
struct bio *current_bio;/* current_bio accepting new data */
atomic_t pending_stripe;/* how many stripes are not yet flushed to raid */
u64 seq; /* seq number of the metablock */
sector_t log_start; /* where the io_unit starts */
sector_t log_end; /* where the io_unit ends */
struct list_head log_sibling; /* log->running_ios */
struct list_head stripe_list; /* stripes added to the io_unit */
int state;
bool need_split_bio;
struct bio *split_bio;
unsigned int has_flush:1; /* includes a flush request */
unsigned int has_fua:1; /* includes a fua request */
unsigned int has_null_flush:1; /* includes a null flush request */
unsigned int has_flush_payload:1; /* includes a flush payload */
/*
 * The io hasn't been sent yet; a flush/fua request can only be submitted
 * once it is the first IO in the running_ios list.
 */
unsigned int io_deferred:1;
struct bio_list flush_barriers; /* size == 0 flush bios */
};
/*
 * r5l_io_unit state. The values increase as the io_unit progresses;
 * __r5l_set_io_unit_state() warns and refuses to move to a state that is
 * not strictly larger than the current one.
 */
enum r5l_io_unit_state {
IO_UNIT_RUNNING = 0, /* accepting new IO */
IO_UNIT_IO_START = 1, /* io_unit bio has started writing to the log,
* no longer accepting new bios */
IO_UNIT_IO_END = 2, /* io_unit bio has finished writing to the log */
IO_UNIT_STRIPE_END = 3, /* stripe data has finished writing to raid */
};
bool r5c_is_writeback(struct r5l_log *log)
{
return (log != NULL &&
log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK);
}
/*
 * Advance log position @start by @inc sectors, wrapping around once when
 * the result reaches log->device_size.
 */
static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
{
	sector_t pos = start + inc;

	return pos >= log->device_size ? pos - log->device_size : pos;
}
/*
 * Number of sectors from @start forward to @end in the log ring. When @end
 * lies before @start, the distance is measured across the wrap-around at
 * log->device_size.
 */
static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
				  sector_t end)
{
	if (end < start)
		end += log->device_size;

	return end - start;
}
/*
 * Check whether @size more sectors fit into the log. The space in use is
 * the ring distance from log->last_checkpoint to log->log_start; used plus
 * requested space must stay strictly below log->device_size.
 */
static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
{
	sector_t in_use = r5l_ring_distance(log, log->last_checkpoint,
					    log->log_start);

	return in_use + size < log->device_size;
}
/*
 * Move @io to @state. Only forward transitions are accepted: if the current
 * state is already at or beyond @state, a warning is raised and the state
 * is left untouched.
 */
static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
				    enum r5l_io_unit_state state)
{
	if (!WARN_ON(io->state >= state))
		io->state = state;
}
/*
 * Complete the bios queued on dev->written. The list is detached from @dev
 * first, then each bio that starts before dev->sector + STRIPE_SECTORS gets
 * md_write_end() and bio_endio().
 */
static void
r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev)
{
	struct bio *cur, *next;

	cur = dev->written;
	dev->written = NULL;

	for (; cur && cur->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS;
	     cur = next) {
		/* look up the successor before cur is completed */
		next = r5_next_bio(cur, dev->sector);
		md_write_end(conf->mddev);
		bio_endio(cur);
	}
}
void r5c_handle_cached_data_endio(struct r5conf *conf,
struct stripe_head *sh, int disks)
{
int i;
for (i = sh->disks; i--; ) {
if (sh->dev[i].written) {
set_bit(R5_UPTODATE, &sh->dev[i].flags);
r5c_return_dev_pending_writes(conf, &sh->dev[i]);
md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
STRIPE_SECTORS,
!test_bit(STRIPE_DEGRADED, &sh->state),
0);
}
}
}
/*
 * Forward declaration so that the helpers below can call r5l_wake_reclaim()
 * (the definition is presumably further down in this file).
 */
void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
/*
 * Check whether some stripes should be flushed to free up stripe cache.
 * Does nothing unless the journal is in write-back mode.
 */
void r5c_check_stripe_cache_usage(struct r5conf *conf)
{
	int nr_partial, nr_full;

	if (!r5c_is_writeback(conf->log))
		return;

	nr_partial = atomic_read(&conf->r5c_cached_partial_stripes);
	nr_full = atomic_read(&conf->r5c_cached_full_stripes);

	/*
	 * Reclaim is woken when either of the following holds:
	 *  - the cached stripes (partial + full) exceed half of
	 *    min_nr_stripes, or
	 *  - empty_inactive_list_nr > 0.
	 * Otherwise there is nothing to do.
	 */
	if (nr_partial + nr_full <= conf->min_nr_stripes / 2 &&
	    atomic_read(&conf->empty_inactive_list_nr) <= 0)
		return;

	r5l_wake_reclaim(conf->log, 0);
}
/*
 * Wake up reclaim once enough full stripes are cached. "Enough" is the
 * smaller of R5C_FULL_STRIPE_FLUSH_BATCH(conf) and one full stripe
 * (conf->chunk_sectors >> STRIPE_SHIFT, i.e. chunk size / 4k stripes).
 * Does nothing unless the journal is in write-back mode.
 */
void r5c_check_cached_full_stripe(struct r5conf *conf)
{
	if (!r5c_is_writeback(conf->log))
		return;

	/* still below the threshold: no wake-up needed */
	if (atomic_read(&conf->r5c_cached_full_stripes) <
	    min(R5C_FULL_STRIPE_FLUSH_BATCH(conf),
		conf->chunk_sectors >> STRIPE_SHIFT))
		return;

	r5l_wake_reclaim(conf->log, 0);
}
/*
* Total log space (in sectors) needed to flush all data in cache
*
* To avoid deadlock due to log space, it is necessary to reserve log
* space to flush critical stripes (stripes that occupying log space near
* last_checkpoint). This function helps check how much log space is
* required to flush all cached stripes.
*
* To reduce log space requirements, two mechanisms are used to give cache
* flush higher priorities:
* 1. In handle_stripe_dirtying() and schedule_reconstruction(),
* stripes ALREADY in journal can be flushed w/o pending writes;
* 2. In r5l_write_stripe() and r5c_cache_data(), stripes NOT in journal
* can be delayed (r5l_add_no_space_stripe).
*
* In cache flush, the stripe goes through 1 and then 2. For a stripe that
* already passed 1, flushing it requires at most (conf->max_degraded + 1)
* pages of journal space. For stripes that has not passed 1, flushing it
* requires (conf->raid_disks + 1) pages of journal space. There are at
* most (conf->group_cnt + 1) stripe that passed 1. So total journal space
* required to flush all cached stripes (in pages) is:
*
* (stripe_in_journal_count - group_cnt - 1) * (max_degraded + 1) +
* (group_cnt + 1) * (raid_disks + 1)
* or
* (stripe_in_journal_count) * (max_degraded + 1) +
* (group_cnt + 1) * (raid_disks - max_degraded)
*/
static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
{
	struct r5l_log *log = conf->log;

	/*
	 * In write-back mode the result is, in BLOCK_SECTORS-sized pages:
	 *   (group_cnt + 1) * (raid_disks - max_degraded) +
	 *   stripe_in_journal_count * (max_degraded + 1)
	 */
	if (r5c_is_writeback(log))
		return BLOCK_SECTORS *
			((conf->group_cnt + 1) *
				(conf->raid_disks - conf->max_degraded) +
			 atomic_read(&log->stripe_in_journal_count) *
				(conf->max_degraded + 1));

	/* nothing is required when the journal is not in write-back mode */
	return 0;
}
/*
 * Re-evaluate log space usage and refresh R5C_LOG_TIGHT and
 * R5C_LOG_CRITICAL in conf->cache_state.
 *
 * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of
 * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log
 * device is less than 2x of reclaim_required_space. When the CRITICAL bit
 * was set and is being cleared by this call, reclaim is woken up.
 */
static inline void r5c_update_log_state(struct r5l_log *log)
{
	struct r5conf *conf = log->rdev->mddev->private;
	sector_t avail;
	sector_t needed;
	bool leaving_critical = false;

	if (!r5c_is_writeback(log))
		return;

	avail = r5l_ring_distance(log, log->log_start, log->last_checkpoint);
	needed = r5c_log_required_to_flush_cache(conf);

	if (avail >= 2 * needed) {
		/* remember whether we are dropping out of the critical state */
		if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state))
			leaving_critical = true;
		clear_bit(R5C_LOG_CRITICAL, &conf->cache_state);
	} else {
		set_bit(R5C_LOG_CRITICAL, &conf->cache_state);
	}

	if (avail >= 3 * needed)
		clear_bit(R5C_LOG_TIGHT, &conf->cache_state);
	else
		set_bit(R5C_LOG_TIGHT, &conf->cache_state);

	if (leaving_critical)
		r5l_wake_reclaim(log, 0);
}
/*
 * Move the stripe into the writing-out phase by clearing STRIPE_R5C_CACHING.
 * Must only be called in write-back mode (enforced with BUG_ON).
 */
void r5c_make_stripe_write_out(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;

	BUG_ON(!r5c_is_writeback(conf->log));

	WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
	clear_bit(STRIPE_R5C_CACHING, &sh->state);

	/* Count the stripe in preread_active_stripes only on the 0 -> 1 edge. */
	if (test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
		return;
	atomic_inc(&conf->preread_active_stripes);
}
/*
 * For every device of the stripe that had R5_Wantwrite set: clear that bit,
 * mark the device R5_InJournal and drop R5_LOCKED. Finally release the
 * stripe from the log trap (STRIPE_LOG_TRAPPED).
 */
static void r5c_handle_data_cached(struct stripe_head *sh)
{
	int idx;

	for (idx = sh->disks - 1; idx >= 0; idx--) {
		unsigned long *dev_flags = &sh->dev[idx].flags;

		if (!test_and_clear_bit(R5_Wantwrite, dev_flags))
			continue;

		set_bit(R5_InJournal, dev_flags);
		clear_bit(R5_LOCKED, dev_flags);
	}

	clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
}
/*
 * This journal write must contain full parity; it may also contain some
 * data pages. Every device already marked R5_InJournal is flagged
 * R5_Wantwrite.
 */
static void r5c_handle_parity_cached(struct stripe_head *sh)
{
	int idx;

	for (idx = sh->disks - 1; idx >= 0; idx--) {
		unsigned long *dev_flags = &sh->dev[idx].flags;

		if (test_bit(R5_InJournal, dev_flags))
			set_bit(R5_Wantwrite, dev_flags);
	}
}
/*
 * Set the proper flags after writing (or flushing) data and/or parity to the
 * log device. This is called from r5l_log_endio() or r5l_log_flush_endio().
 */
static void r5c_finish_cache_stripe(struct stripe_head *sh)
{
	struct r5l_log *log = sh->raid_conf->log;

	if (log->r5c_journal_mode != R5C_JOURNAL_MODE_WRITE_THROUGH) {
		if (test_bit(STRIPE_R5C_CACHING, &sh->state)) {
			/* caching phase: only data went to the journal */
			r5c_handle_data_cached(sh);
			return;
		}
		r5c_handle_parity_cached(sh);
	} else {
		BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
	}

	/*
	 * Set R5_InJournal for parity dev[pd_idx]. This means all data AND
	 * parity are in the journal. For RAID 6, it is NOT necessary to set
	 * the flag for dev[qd_idx], as the two parities are written out
	 * together.
	 */
	set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
}
/*
 * Detach every stripe from @io's stripe_list, update its journal flags via
 * r5c_finish_cache_stripe(), mark it STRIPE_HANDLE and release it.
 */
static void r5l_io_run_stripes(struct r5l_io_unit *io)
{
	struct stripe_head *stripe, *tmp;

	list_for_each_entry_safe(stripe, tmp, &io->stripe_list, log_list) {
		list_del_init(&stripe->log_list);

		r5c_finish_cache_stripe(stripe);

		set_bit(STRIPE_HANDLE, &stripe->state);
		raid5_release_stripe(stripe);
	}
}
/*
 * Walk running_ios from the head and, for each leading io_unit that has
 * reached IO_UNIT_IO_END, move it to finished_ios and run its stripes.
 * Stops at the first io_unit that has not reached that state so the list
 * order is preserved. Caller must hold log->io_list_lock.
 */
static void r5l_log_run_stripes(struct r5l_log *log)
{
	struct r5l_io_unit *unit, *tmp;

	lockdep_assert_held(&log->io_list_lock);

	list_for_each_entry_safe(unit, tmp, &log->running_ios, log_sibling) {
		/* don't change list order */
		if (unit->state < IO_UNIT_IO_END)
			break;

		list_move_tail(&unit->log_sibling, &log->finished_ios);
		r5l_io_run_stripes(unit);
	}
}
/*
 * Move the leading io_units of running_ios that have reached
 * IO_UNIT_IO_END onto io_end_ios, stopping at the first one that has not,
 * so the list order is preserved. Caller must hold log->io_list_lock.
 */
static void r5l_move_to_end_ios(struct r5l_log *log)
{
	struct r5l_io_unit *unit, *tmp;

	lockdep_assert_held(&log->io_list_lock);

	list_for_each_entry_safe(unit, tmp, &log->running_ios, log_sibling) {
		/* don't change list order */
		if (unit->state < IO_UNIT_IO_END)
			break;

		list_move_tail(&unit->log_sibling, &log->io_end_ios);
	}
}
static void __r5l_stripe_write_finished(struct r5l_io_unit *io);
/*
 * Completion handler for a journal write bio (installed as bi_end_io in
 * r5l_new_meta()).
 *
 * Reports a failed write via md_error(), releases the bio and the meta page,
 * marks the io_unit IO_UNIT_IO_END and then either parks finished io_units
 * on io_end_ios (when the log needs a cache flush and this io_unit carries
 * stripes) or runs their stripes directly. It also kicks the deferred-io
 * work if the new head of running_ios is a deferred io_unit, and finally
 * completes flush barriers / flush payloads attached to this io_unit.
 */
static void r5l_log_endio(struct bio *bio)
{
struct r5l_io_unit *io = bio->bi_private;
struct r5l_io_unit *io_deferred;
struct r5l_log *log = io->log;
unsigned long flags;
bool has_null_flush;
bool has_flush_payload;
/* a failed journal write is reported against the journal device */
if (bio->bi_status)
md_error(log->rdev->mddev, log->rdev);
bio_put(bio);
mempool_free(io->meta_page, &log->meta_pool);
spin_lock_irqsave(&log->io_list_lock, flags);
__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
/*
 * If the io does not have null_flush or flush payload,
 * it is not safe to access it after releasing io_list_lock.
 * Therefore, it is necessary to sample both conditions with
 * the lock held.
 */
has_null_flush = io->has_null_flush;
has_flush_payload = io->has_flush_payload;
if (log->need_cache_flush && !list_empty(&io->stripe_list))
r5l_move_to_end_ios(log);
else
r5l_log_run_stripes(log);
if (!list_empty(&log->running_ios)) {
/*
 * FLUSH/FUA io_unit is deferred because of ordering, now we
 * can dispatch it
 */
io_deferred = list_first_entry(&log->running_ios,
struct r5l_io_unit, log_sibling);
if (io_deferred->io_deferred)
schedule_work(&log->deferred_io_work);
}
spin_unlock_irqrestore(&log->io_list_lock, flags);
if (log->need_cache_flush)
md_wakeup_thread(log->rdev->mddev->thread);
/* finish flush only io_unit and PAYLOAD_FLUSH only io_unit */
if (has_null_flush) {
struct bio *bi;
WARN_ON(bio_list_empty(&io->flush_barriers));
while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) {
bio_endio(bi);
/*
 * Each barrier bio holds one pending_stripe reference (taken in
 * r5l_handle_flush_request()). Once the count reaches zero the
 * io_unit is handed to __r5l_stripe_write_finished() and must not
 * be touched again, hence the immediate return.
 */
if (atomic_dec_and_test(&io->pending_stripe)) {
__r5l_stripe_write_finished(io);
return;
}
}
}
/* decrease pending_stripe for flush payload */
if (has_flush_payload)
if (atomic_dec_and_test(&io->pending_stripe))
__r5l_stripe_write_finished(io);
}
static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
{
unsigned long flags;
spin_lock_irqsave(&log->io_list_lock, flags);
__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
spin_unlock_irqrestore(&log->io_list_lock, flags);
/*
* In case of journal device failures, submit_bio will get error
* and calls endio, then active stripes will continue write
* process. Therefore, it is not necessary to check Faulty bit
* of journal device here.
*
* We can't check split_bio after current_bio is submitted. If
* io->split_bio is null, after current_bio is submitted, current_bio
* might already be completed and the io_unit is freed. We submit
* split_bio first to avoid the issue.
*/
if (io->split_bio) {
if (io->has_flush)
io->split_bio->bi_opf |= REQ_PREFLUSH;
if (io->has_fua)
io->split_bio->bi_opf |= REQ_FUA;
submit_bio(io->split_bio);
}
if (io->has_flush)
io->current_bio->bi_opf |= REQ_PREFLUSH;
if (io->has_fua)
io->current_bio->bi_opf |= REQ_FUA;
submit_bio(io->current_bio);
}
/*
 * Work handler that dispatches a deferred io_unit: if the head of
 * running_ios is marked io_deferred, clear the mark under io_list_lock and
 * submit that io_unit.
 */
static void r5l_submit_io_async(struct work_struct *work)
{
	struct r5l_log *log = container_of(work, struct r5l_log,
					   deferred_io_work);
	struct r5l_io_unit *head;
	struct r5l_io_unit *to_submit = NULL;
	unsigned long irq_flags;

	spin_lock_irqsave(&log->io_list_lock, irq_flags);
	if (!list_empty(&log->running_ios)) {
		head = list_first_entry(&log->running_ios, struct r5l_io_unit,
					log_sibling);
		if (head->io_deferred) {
			head->io_deferred = 0;
			to_submit = head;
		}
	}
	spin_unlock_irqrestore(&log->io_list_lock, irq_flags);

	if (to_submit)
		r5l_do_submit_io(log, to_submit);
}
/*
 * Work handler that switches the journal from write-back to write-through
 * mode (the log message states this is done for a degraded array).
 *
 * Nothing is done if the log is already in write-through mode. Otherwise the
 * handler waits until either the log is gone (conf->log == NULL) or no
 * superblock change is pending and the mddev lock could be taken; only in
 * the latter case is the mode changed, with the array suspended around the
 * update.
 */
static void r5c_disable_writeback_async(struct work_struct *work)
{
struct r5l_log *log = container_of(work, struct r5l_log,
disable_writeback_work);
struct mddev *mddev = log->rdev->mddev;
struct r5conf *conf = mddev->private;
int locked = 0;
if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
return;
pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n",
mdname(mddev));
/*
 * wait superblock change before suspend
 *
 * The trylock result is stored in 'locked' from inside the wait
 * condition, so 'locked' stays 0 when the wait ends because
 * conf->log became NULL.
 */
wait_event(mddev->sb_wait,
conf->log == NULL ||
(!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) &&
(locked = mddev_trylock(mddev))));
if (locked) {
mddev_suspend(mddev);
log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
mddev_resume(mddev);
mddev_unlock(mddev);
}
}
/*
 * Seal log->current_io (record meta_size, compute the meta block checksum),
 * detach it from the log and submit it. An io_unit that needs FLUSH or FUA
 * and is not at the head of running_ios is only marked io_deferred here; it
 * is not submitted by this function.
 */
static void r5l_submit_current_io(struct r5l_log *log)
{
	struct r5l_io_unit *io = log->current_io;
	struct r5l_meta_block *mb;
	unsigned long irq_flags;
	bool defer = false;
	u32 csum;

	if (!io)
		return;

	mb = page_address(io->meta_page);
	mb->meta_size = cpu_to_le32(io->meta_offset);
	csum = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	mb->checksum = cpu_to_le32(csum);

	log->current_io = NULL;

	spin_lock_irqsave(&log->io_list_lock, irq_flags);
	if ((io->has_flush || io->has_fua) &&
	    io != list_first_entry(&log->running_ios,
				   struct r5l_io_unit, log_sibling)) {
		io->io_deferred = 1;
		defer = true;
	}
	spin_unlock_irqrestore(&log->io_list_lock, irq_flags);

	if (!defer)
		r5l_do_submit_io(log, io);
}
/*
 * Allocate a write bio from the log's bio set, targeting the journal device
 * at the current log_start (offset by the rdev's data_offset).
 */
static struct bio *r5l_bio_alloc(struct r5l_log *log)
{
	struct bio *wbio;

	wbio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, &log->bs);

	bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
	bio_set_dev(wbio, log->rdev->bdev);
	wbio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;

	return wbio;
}
static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
{
log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
r5c_update_log_state(log);
/*
* If we filled up the log device start from the beginning again,
* which will require a new bio.
*
* Note: for this to work properly the log size needs to me a multiple
* of BLOCK_SECTORS.
*/
if (log->log_start == 0)
io->need_split_bio = true;
io->log_end = log->log_start;
}
static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
{
struct r5l_io_unit *io;
struct r5l_meta_block *block;
io = mempool_alloc(&log->io_pool, GFP_ATOMIC);
if (!io)
return NULL;
memset(io, 0, sizeof(*io));
io->log = log;
INIT_LIST_HEAD(&io->log_sibling);
INIT_LIST_HEAD(&io->stripe_list);
bio_list_init(&io->flush_barriers);
io->state = IO_UNIT_RUNNING;
io->meta_page = mempool_alloc(&log->meta_pool, GFP_NOIO);
block = page_address(io->meta_page);
clear_page(block);
block->magic = cpu_to_le32(R5LOG_MAGIC);
block->version = R5LOG_VERSION;
block->seq = cpu_to_le64(log->seq);
block->position = cpu_to_le64(log->log_start);
io->log_start = log->log_start;
io->meta_offset = sizeof(struct r5l_meta_block);
io->seq = log->seq++;
io->current_bio = r5l_bio_alloc(log);
io->current_bio->bi_end_io = r5l_log_endio;
io->current_bio->bi_private = io;
bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);
r5_reserve_log_entry(log, io);
spin_lock_irq(&log->io_list_lock);
list_add_tail(&io->log_sibling, &log->running_ios);
spin_unlock_irq(&log->io_list_lock);
return io;
}
static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
{
if (log->current_io &&
log->current_io->meta_offset + payload_size > PAGE_SIZE)
r5l_submit_current_io(log);
if (!log->current_io) {
log->current_io = r5l_new_meta(log);
if (!log->current_io)
return -ENOMEM;
}
return 0;
}
/*
 * Append one data/parity payload descriptor to the meta block of
 * log->current_io and advance meta_offset past it.
 *
 * The descriptor covers one page and carries one checksum, or two pages and
 * two checksums when @checksum2_valid is set.
 */
static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
				    sector_t location,
				    u32 checksum1, u32 checksum2,
				    bool checksum2_valid)
{
	struct r5l_io_unit *io = log->current_io;
	struct r5l_payload_data_parity *pl;
	int nr_pages = checksum2_valid ? 2 : 1;

	pl = page_address(io->meta_page) + io->meta_offset;

	pl->header.type = cpu_to_le16(type);
	pl->header.flags = cpu_to_le16(0);
	/* size is expressed in 512-byte sectors */
	pl->size = cpu_to_le32(nr_pages << (PAGE_SHIFT - 9));
	pl->location = cpu_to_le64(location);

	pl->checksum[0] = cpu_to_le32(checksum1);
	if (checksum2_valid)
		pl->checksum[1] = cpu_to_le32(checksum2);

	io->meta_offset += sizeof(struct r5l_payload_data_parity) +
		sizeof(__le32) * nr_pages;
}
/*
 * Add @page to the bio of log->current_io and reserve one log block for it.
 * If the previous reservation flagged need_split_bio, the current bio is
 * first parked as split_bio and a new bio, chained to it, takes over.
 */
static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
{
	struct r5l_io_unit *io = log->current_io;

	if (io->need_split_bio) {
		struct bio *fresh;

		BUG_ON(io->split_bio);
		io->split_bio = io->current_bio;

		fresh = r5l_bio_alloc(log);
		io->current_bio = fresh;
		bio_chain(fresh, io->split_bio);

		io->need_split_bio = false;
	}

	if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
		BUG();

	r5_reserve_log_entry(log, io);
}
/*
 * Record a flush payload for the stripe at sector @sect in the current meta
 * block. Silently does nothing while the array is quiescing or when no meta
 * block can be obtained.
 */
static void r5l_append_flush_payload(struct r5l_log *log, sector_t sect)
{
	struct r5conf *conf = log->rdev->mddev->private;
	struct r5l_payload_flush *pl;
	struct r5l_io_unit *io;
	int meta_size;

	/*
	 * payload_flush requires extra writes to the journal.
	 * To avoid handling the extra IO in quiesce, just skip
	 * flush_payload
	 */
	if (conf->quiesce)
		return;

	mutex_lock(&log->io_mutex);

	meta_size = sizeof(struct r5l_payload_flush) + sizeof(__le64);
	if (r5l_get_meta(log, meta_size))
		goto out_unlock;

	/* current implementation is one stripe per flush payload */
	io = log->current_io;
	pl = page_address(io->meta_page) + io->meta_offset;
	pl->header.type = cpu_to_le16(R5LOG_PAYLOAD_FLUSH);
	pl->header.flags = cpu_to_le16(0);
	pl->size = cpu_to_le32(sizeof(__le64));
	pl->flush_stripes[0] = cpu_to_le64(sect);
	io->meta_offset += meta_size;

	/* multiple flush payloads count as one pending_stripe */
	if (!io->has_flush_payload) {
		io->has_flush_payload = 1;
		atomic_inc(&io->pending_stripe);
	}

out_unlock:
	mutex_unlock(&log->io_mutex);
}
/*
 * Append the pending writes of stripe @sh to the current io_unit of @log.
 *
 * @data_pages and @parity_pages are the number of data and parity pages the
 * caller expects to be logged; they size the meta data reservation.
 * @parity_pages must be 0, 1 or 2 (anything else hits BUG_ON).
 *
 * Returns 0 on success or the error from r5l_get_meta() when no meta block
 * could be obtained; in that case nothing has been appended.
 */
static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
int data_pages, int parity_pages)
{
int i;
int meta_size;
int ret;
struct r5l_io_unit *io;
/*
 * one payload header + one checksum per data page, plus a single
 * payload header with one checksum per parity page
 */
meta_size =
((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
* data_pages) +
sizeof(struct r5l_payload_data_parity) +
sizeof(__le32) * parity_pages;
ret = r5l_get_meta(log, meta_size);
if (ret)
return ret;
io = log->current_io;
if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state))
io->has_flush = 1;
/* log every data device that wants a write and is not yet in the journal */
for (i = 0; i < sh->disks; i++) {
if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
test_bit(R5_InJournal, &sh->dev[i].flags))
continue;
/* parity is appended separately below */
if (i == sh->pd_idx || i == sh->qd_idx)
continue;
if (test_bit(R5_WantFUA, &sh->dev[i].flags) &&
log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) {
io->has_fua = 1;
/*
 * we need to flush journal to make sure recovery can
 * reach the data with fua flag
 */
io->has_flush = 1;
}
r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
raid5_compute_blocknr(sh, i, 0),
sh->dev[i].log_checksum, 0, false);
r5l_append_payload_page(log, sh->dev[i].page);
}
if (parity_pages == 2) {
r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
sh->sector, sh->dev[sh->pd_idx].log_checksum,
sh->dev[sh->qd_idx].log_checksum, true);
r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
} else if (parity_pages == 1) {
r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
sh->sector, sh->dev[sh->pd_idx].log_checksum,
0, false);
r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
} else /* Just writing data, not parity, in caching phase */
BUG_ON(parity_pages != 0);
/* the stripe now holds one pending_stripe reference on the io_unit */
list_add_tail(&sh->log_list, &io->stripe_list);
atomic_inc(&io->pending_stripe);
sh->log_io = io;
if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
return 0;
/*
 * Write-back only: a stripe whose log_start is still MaxSector is
 * given this io_unit's log_start and added to stripe_in_journal_list.
 */
if (sh->log_start == MaxSector) {
BUG_ON(!list_empty(&sh->r5c));
sh->log_start = io->log_start;
spin_lock_irq(&log->stripe_in_journal_lock);
list_add_tail(&sh->r5c,
&log->stripe_in_journal_list);
spin_unlock_irq(&log->stripe_in_journal_lock);
atomic_inc(&log->stripe_in_journal_count);
}
return 0;
}
/*
 * Add the stripe to the tail of log->no_space_stripes under
 * no_space_stripes_lock. This helper does not wake the reclaim thread
 * itself; callers that want reclaim to run call r5l_wake_reclaim().
 */
static inline void r5l_add_no_space_stripe(struct r5l_log *log,
struct stripe_head *sh)
{
spin_lock(&log->no_space_stripes_lock);
list_add_tail(&sh->log_list, &log->no_space_stripes);
spin_unlock(&log->no_space_stripes_lock);
}
/*
 * Write the pending data/parity of stripe @sh to the journal.
 *
 * running in raid5d, where reclaim could wait for raid5d too (when it flushes
 * data from log to raid disks), so we shouldn't wait for reclaim here
 *
 * Returns -EAGAIN when the stripe is not handled by the log (no log, stripe
 * already logged, parity not being written, or stripe syncing); the caller
 * then writes to the raid disks directly. Returns 0 once the log has taken a
 * reference on the stripe: it was either appended to the journal or queued
 * on no_space_stripes / no_mem_stripes to be retried later.
 */
int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
{
struct r5conf *conf = sh->raid_conf;
int write_disks = 0;
int data_pages, parity_pages;
int reserve;
int i;
int ret = 0;
bool wake_reclaim = false;
if (!log)
return -EAGAIN;
/* Don't support stripe batch */
if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
test_bit(STRIPE_SYNCING, &sh->state)) {
/* the stripe is written to log, we start writing it to raid */
clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
return -EAGAIN;
}
WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
/* count the pages to log and checksum each of them */
for (i = 0; i < sh->disks; i++) {
void *addr;
if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
test_bit(R5_InJournal, &sh->dev[i].flags))
continue;
write_disks++;
/* checksum is already calculated in last run */
if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
continue;
addr = kmap_atomic(sh->dev[i].page);
sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
addr, PAGE_SIZE);
kunmap_atomic(addr);
}
/* qd_idx >= 0 means the stripe has a second parity block */
parity_pages = 1 + !!(sh->qd_idx >= 0);
data_pages = write_disks - parity_pages;
set_bit(STRIPE_LOG_TRAPPED, &sh->state);
/*
 * The stripe must enter state machine again to finish the write, so
 * don't delay.
 */
clear_bit(STRIPE_DELAYED, &sh->state);
/* reference held on behalf of the log until the stripe is released */
atomic_inc(&sh->count);
mutex_lock(&log->io_mutex);
/* meta + data */
reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
if (!r5l_has_free_space(log, reserve)) {
r5l_add_no_space_stripe(log, sh);
wake_reclaim = true;
} else {
ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
/* no io_unit available: park the stripe on no_mem_stripes */
if (ret) {
spin_lock_irq(&log->io_list_lock);
list_add_tail(&sh->log_list,
&log->no_mem_stripes);
spin_unlock_irq(&log->io_list_lock);
}
}
} else { /* R5C_JOURNAL_MODE_WRITE_BACK */
/*
 * log space critical, do not process stripes that are
 * not in cache yet (sh->log_start == MaxSector).
 */
if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
sh->log_start == MaxSector) {
r5l_add_no_space_stripe(log, sh);
wake_reclaim = true;
reserve = 0;
} else if (!r5l_has_free_space(log, reserve)) {
/*
 * NOTE(review): unlike the other no-space paths, this branch
 * does not set wake_reclaim - confirm that is intended.
 */
if (sh->log_start == log->last_checkpoint)
BUG();
else
r5l_add_no_space_stripe(log, sh);
} else {
ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
/* no io_unit available: park the stripe on no_mem_stripes */
if (ret) {
spin_lock_irq(&log->io_list_lock);
list_add_tail(&sh->log_list,
&log->no_mem_stripes);
spin_unlock_irq(&log->io_list_lock);
}
}
}
mutex_unlock(&log->io_mutex);
if (wake_reclaim)
r5l_wake_reclaim(log, reserve);
return 0;
}
/*
 * Submit the io_unit currently being filled, if any. A NULL @log is
 * accepted and ignored.
 */
void r5l_write_stripe_run(struct r5l_log *log)
{
	if (log) {
		mutex_lock(&log->io_mutex);
		r5l_submit_current_io(log);
		mutex_unlock(&log->io_mutex);
	}
}
int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
{
if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
/*
* in write through (journal only)
* we flush log disk cache first, then write stripe data to
* raid disks. So if bio is finished, the log disk cache is
* flushed already. The recovery guarantees we can recovery
* the bio from log disk, so we don't need to flush again
*/
if (bio->bi_iter.bi_size == 0) {
bio_endio(bio);
return 0;
}
bio->bi_opf &= ~REQ_PREFLUSH;
} else {
/* write back (with cache) */
if (bio->bi_iter.bi_size == 0) {
mutex_lock(&log->io_mutex);
r5l_get_meta(log, 0);
bio_list_add(&log->current_io->flush_barriers, bio);
log->current_io->has_flush = 1;
log->current_io->has_null_flush = 1;
atomic_inc(&log->current_io->pending_stripe);
r5l_submit_current_io(log);
mutex_unlock(&log->io_mutex);
return 0;
}
}
return -EAGAIN;
}
/* This will run after log space is reclaimed */
static void r5l_run_no_space_stripes(struct r5l_log *log)
{
struct stripe_head *sh;
spin_lock(&log->no_space_stripes_lock);
while (!list_empty(&log->no_space_stripes)) {
sh = list_first_entry(&log->no_space_stripes,
struct stripe_head, log_list);
list_del_init(&sh->log_list);
set_bit(STRIPE_HANDLE, &sh->state);
raid5_release_stripe(sh);
}
spin_unlock(&log->no_space_stripes_lock);
}
/*
* calculate new last_checkpoint
* for write through mode, returns log->next_checkpoint
* for write back, returns log_start of first sh in stripe_in_journal_list
*/
static sector_t r5c_calculate_new_cp(struct r5conf *conf)
{
struct stripe_head *sh;
struct r5l_log *log = conf->log;
sector_t new_cp;
unsigned long flags;
if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
return log->next_checkpoint;
spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
if (list_empty(&conf->log->stripe_in_journal_list)) {
/* all stripes flushed */
spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
return log->next_checkpoint;
}
sh = list_first_entry(&conf->log->stripe_in_journal_list,
struct stripe_head, r5c);
new_cp = sh->log_start;
spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
return new_cp;
}
/*
 * Ring distance from the current last_checkpoint to the checkpoint that
 * r5c_calculate_new_cp() would pick now.
 */
static sector_t r5l_reclaimable_space(struct r5l_log *log)
{
	struct r5conf *conf = log->rdev->mddev->private;
	sector_t new_cp = r5c_calculate_new_cp(conf);

	return r5l_ring_distance(log, log->last_checkpoint, new_cp);
}
/*
 * Re-run at most one stripe from no_mem_stripes: take the first entry off
 * the list, mark it STRIPE_HANDLE and release it. Caller must hold
 * log->io_list_lock.
 */
static void r5l_run_no_mem_stripe(struct r5l_log *log)
{
	struct stripe_head *stripe;

	lockdep_assert_held(&log->io_list_lock);

	if (list_empty(&log->no_mem_stripes))
		return;

	stripe = list_first_entry(&log->no_mem_stripes,
				  struct stripe_head, log_list);
	list_del_init(&stripe->log_list);
	set_bit(STRIPE_HANDLE, &stripe->state);
	raid5_release_stripe(stripe);
}
/*
 * Free the leading io_units of finished_ios that have reached
 * IO_UNIT_STRIPE_END, advancing log->next_checkpoint to each one's
 * log_start. Every freed io_unit gives one no_mem stripe a chance to run.
 * Caller must hold log->io_list_lock.
 *
 * Returns true if at least one io_unit was freed.
 */
static bool r5l_complete_finished_ios(struct r5l_log *log)
{
	struct r5l_io_unit *unit, *tmp;
	bool freed_any = false;

	lockdep_assert_held(&log->io_list_lock);

	list_for_each_entry_safe(unit, tmp, &log->finished_ios, log_sibling) {
		/* don't change list order */
		if (unit->state < IO_UNIT_STRIPE_END)
			break;

		log->next_checkpoint = unit->log_start;

		list_del(&unit->log_sibling);
		mempool_free(unit, &log->io_pool);
		r5l_run_no_mem_stripe(log);

		freed_any = true;
	}

	return freed_any;
}
static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
{
struct r5l_log *log = io->log;
struct r5conf *conf = log->rdev->mddev->private;
unsigned long flags;
spin_lock_irqsave(&log->io_list_lock, flags);
__r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
if (!r5l_complete_finished_ios(log)) {
spin_unlock_irqrestore(&log->io_list_lock, flags);
return;
}
if (r5l_reclaimable_space(log) > log->max_free_space ||
test_bit(R5C_LOG_TIGHT, &conf->cache_state))
r5l_wake_reclaim(log, 0);
spin_unlock_irqrestore(&log->io_list_lock, flags);
wake_up(&log->iounit_wait);
}
/*
 * Detach @sh from its io_unit and drop the stripe's pending_stripe
 * reference; the last reference finishes the io_unit.
 */
void r5l_stripe_write_finished(struct stripe_head *sh)
{
	struct r5l_io_unit *io = sh->log_io;

	sh->log_io = NULL;

	if (!io)
		return;
	if (atomic_dec_and_test(&io->pending_stripe))
		__r5l_stripe_write_finished(io);
}
/*
 * Completion handler of log->flush_bio: report a failed flush via
 * md_error(), run the stripes of every io_unit on flushing_ios and move
 * those io_units to finished_ios.
 */
static void r5l_log_flush_endio(struct bio *bio)
{
	struct r5l_log *log = container_of(bio, struct r5l_log,
					   flush_bio);
	struct r5l_io_unit *unit;
	unsigned long irq_flags;

	if (bio->bi_status)
		md_error(log->rdev->mddev, log->rdev);

	spin_lock_irqsave(&log->io_list_lock, irq_flags);
	list_for_each_entry(unit, &log->flushing_ios, log_sibling)
		r5l_io_run_stripes(unit);
	list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
	spin_unlock_irqrestore(&log->io_list_lock, irq_flags);
}
/*
* Starting dispatch IO to raid.
* io_unit(meta) consists of a log. There is one situation we want to avoid. A
* broken meta in the middle of a log causes recovery can't find meta at the
* head of log. If operations require meta at the head persistent in log, we
* must make sure meta before it persistent in log too. A case is:
*
* stripe data/parity is in log, we start write stripe to raid disks. stripe
* data/parity must be persistent in log before we do the write to raid disks.
*
* The solution is we restrictly maintain io_unit list order. In this case, we
* only write stripes of an io_unit to raid disks till the io_unit is the first
* one whose data/parity is in log.
*/
void r5l_flush_stripe_to_raid(struct r5l_log *log)
{
bool do_flush;
if (!log || !log->need_cache_flush)
return;
spin_lock_irq(&log->io_list_lock);
/* flush bio is running */
if (!list_empty(&log->flushing_ios)) {
spin_unlock_irq(&log->io_list_lock);
return;
}
list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
do_flush = !list_empty(&log->flushing_ios);
spin_unlock_irq(&log->io_list_lock);
if (!do_flush)
return;
bio_reset(&log->flush_bio);
bio_set_dev(&log->flush_bio, log->rdev->bdev);
log->flush_bio.bi_end_io = r5l_log_flush_endio;
log->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
submit_bio(&log->flush_bio);
}
static void r5l_write_super(struct r5l_log *log, sector_t cp);
/*
 * Record @end as the new log tail in the superblock and, if the journal
 * device supports discard, discard the log space between the current
 * last_checkpoint and @end.
 *
 * The discard is skipped when the reconfig mutex cannot be taken with a
 * trylock (see the comment in the body).
 */
static void r5l_write_super_and_discard_space(struct r5l_log *log,
sector_t end)
{
struct block_device *bdev = log->rdev->bdev;
struct mddev *mddev;
r5l_write_super(log, end);
if (!blk_queue_discard(bdev_get_queue(bdev)))
return;
mddev = log->rdev->mddev;
/*
 * Discard could zero data, so before discard we must make sure
 * superblock is updated to new log tail. Updating superblock (either
 * directly call md_update_sb() or depend on md thread) must hold
 * reconfig mutex. On the other hand, raid5_quiesce is called with
 * reconfig_mutex held. The first step of raid5_quiesce() is waiting
 * for all IO to finish, hence waiting for the reclaim thread, while the
 * reclaim thread is calling this function and waiting for reconfig
 * mutex. So there is a deadlock. We work around this issue with a
 * trylock.
 * FIXME: we could miss discard if we can't take reconfig mutex
 */
set_mask_bits(&mddev->sb_flags, 0,
BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
if (!mddev_trylock(mddev))
return;
md_update_sb(mddev, 1);
mddev_unlock(mddev);
/* discard IO error really doesn't matter, ignore it */
if (log->last_checkpoint < end) {
blkdev_issue_discard(bdev,
log->last_checkpoint + log->rdev->data_offset,
end - log->last_checkpoint, GFP_NOIO, 0);
} else {
/* the range wraps: discard up to the device end, then from 0 to end */
blkdev_issue_discard(bdev,
log->last_checkpoint + log->rdev->data_offset,
log->device_size - log->last_checkpoint,
GFP_NOIO, 0);
blkdev_issue_discard(bdev, log->rdev->data_offset, end,
GFP_NOIO, 0);
}
}
/*
 * r5c_flush_stripe moves stripe from cached list to handle_list. When called,
 * the stripe must be on r5c_cached_full_stripes or r5c_cached_partial_stripes.
 *
 * The stripe must still be in the caching phase (STRIPE_R5C_CACHING set)
 * and must not already be marked STRIPE_HANDLE; both are enforced with
 * BUG_ON.
 *
 * must hold conf->device_lock
 */
static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
{
BUG_ON(list_empty(&sh->lru));
BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
/*
 * The stripe is not ON_RELEASE_LIST, so it is safe to call
 * raid5_release_stripe() while holding conf->device_lock
 */
BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
lockdep_assert_held(&conf->device_lock);
list_del_init(&sh->lru);
/* take a reference that the raid5_release_stripe() below drops again */
atomic_inc(&sh->count);
set_bit(STRIPE_HANDLE, &sh->state);
atomic_inc(&conf->active_stripes);
r5c_make_stripe_write_out(sh);
/* account the stripe in the matching "flushing" counter */
if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
atomic_inc(&conf->r5c_flushing_partial_stripes);
else
atomic_inc(&conf->r5c_flushing_full_stripes);
raid5_release_stripe(sh);
}
/*
 * Flush cached stripes. Caller must hold conf->device_lock.
 *
 * if num == 0, flush all full stripes
 * if num > 0, flush all full stripes. If fewer than num full stripes were
 *    flushed, flush partial stripes until num stripes have been flushed in
 *    total or there are no more cached stripes.
 */
void r5c_flush_cache(struct r5conf *conf, int num)
{
	struct stripe_head *stripe, *tmp;
	int flushed = 0;

	lockdep_assert_held(&conf->device_lock);
	if (!conf->log)
		return;

	list_for_each_entry_safe(stripe, tmp, &conf->r5c_full_stripe_list,
				 lru) {
		r5c_flush_stripe(conf, stripe);
		flushed++;
	}

	if (flushed >= num)
		return;

	list_for_each_entry_safe(stripe, tmp,
				 &conf->r5c_partial_stripe_list, lru) {
		r5c_flush_stripe(conf, stripe);
		flushed++;
		if (flushed >= num)
			break;
	}
}
/*
 * Write-back cache reclaim: decide, from stripe cache pressure, how many
 * cached stripes to flush, flush them, and additionally flush stripes from
 * stripe_in_journal_list when log space is tight. Stripes waiting for log
 * space are re-run unless the log is in the critical state, and the md
 * thread is woken at the end. Does nothing unless the log is in write-back
 * mode.
 */
static void r5c_do_reclaim(struct r5conf *conf)
{
struct r5l_log *log = conf->log;
struct stripe_head *sh;
int count = 0;
unsigned long flags;
int total_cached;
int stripes_to_flush;
int flushing_partial, flushing_full;
if (!r5c_is_writeback(log))
return;
flushing_partial = atomic_read(&conf->r5c_flushing_partial_stripes);
flushing_full = atomic_read(&conf->r5c_flushing_full_stripes);
/* cached stripes that are not already being flushed */
total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
atomic_read(&conf->r5c_cached_full_stripes) -
flushing_full - flushing_partial;
if (total_cached > conf->min_nr_stripes * 3 / 4 ||
atomic_read(&conf->empty_inactive_list_nr) > 0)
/*
 * if stripe cache pressure high, flush all full stripes and
 * some partial stripes
 */
stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP;
else if (total_cached > conf->min_nr_stripes * 1 / 2 ||
atomic_read(&conf->r5c_cached_full_stripes) - flushing_full >
R5C_FULL_STRIPE_FLUSH_BATCH(conf))
/*
 * if stripe cache pressure moderate, or if there are many full
 * stripes, flush all full stripes
 */
stripes_to_flush = 0;
else
/* no need to flush */
stripes_to_flush = -1;
if (stripes_to_flush >= 0) {
spin_lock_irqsave(&conf->device_lock, flags);
r5c_flush_cache(conf, stripes_to_flush);
spin_unlock_irqrestore(&conf->device_lock, flags);
}
/* if log space is tight, flush stripes on stripe_in_journal_list */
if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) {
/* lock order: stripe_in_journal_lock, then device_lock */
spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
spin_lock(&conf->device_lock);
list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) {
/*
 * stripes on stripe_in_journal_list could be in any
 * state of the stripe_cache state machine. In this
 * case, we only want to flush stripe on
 * r5c_cached_full/partial_stripes. The following
 * condition makes sure the stripe is on one of the
 * two lists.
 */
if (!list_empty(&sh->lru) &&
!test_bit(STRIPE_HANDLE, &sh->state) &&
atomic_read(&sh->count) == 0) {
r5c_flush_stripe(conf, sh);
/*
 * NOTE(review): post-increment compare lets up to
 * R5C_RECLAIM_STRIPE_GROUP + 1 stripes through - confirm
 * whether that is intended.
 */
if (count++ >= R5C_RECLAIM_STRIPE_GROUP)
break;
}
}
spin_unlock(&conf->device_lock);
spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
}
if (!test_bit(R5C_LOG_CRITICAL, &conf->cache_state))
r5l_run_no_space_stripes(log);
md_wakeup_thread(conf->mddev->thread);
}
/*
 * Reclaim log space: wait until at least the requested amount of space
 * (log->reclaim_target, consumed atomically here) is reclaimable or no
 * io_unit is left on any list, then write the new checkpoint to the
 * superblock, advance last_checkpoint and re-run stripes that were waiting
 * for log space.
 */
static void r5l_do_reclaim(struct r5l_log *log)
{
struct r5conf *conf = log->rdev->mddev->private;
/* fetch and reset the target in one step */
sector_t reclaim_target = xchg(&log->reclaim_target, 0);
sector_t reclaimable;
sector_t next_checkpoint;
bool write_super;
spin_lock_irq(&log->io_list_lock);
/* decided before waiting, from the state at entry */
write_super = r5l_reclaimable_space(log) > log->max_free_space ||
reclaim_target != 0 || !list_empty(&log->no_space_stripes);
/*
 * move proper io_unit to reclaim list. We should not change the order.
 * reclaimable/unreclaimable io_unit can be mixed in the list, we
 * shouldn't reuse space of an unreclaimable io_unit
 */
while (1) {
reclaimable = r5l_reclaimable_space(log);
if (reclaimable >= reclaim_target ||
(list_empty(&log->running_ios) &&
list_empty(&log->io_end_ios) &&
list_empty(&log->flushing_ios) &&
list_empty(&log->finished_ios)))
break;
md_wakeup_thread(log->rdev->mddev->thread);
/* sleeps with io_list_lock dropped until more space is reclaimable */
wait_event_lock_irq(log->iounit_wait,
r5l_reclaimable_space(log) > reclaimable,
log->io_list_lock);
}
next_checkpoint = r5c_calculate_new_cp(conf);
spin_unlock_irq(&log->io_list_lock);
if (reclaimable == 0 || !write_super)
return;
/*
 * write_super will flush cache of each raid disk. We must write super
 * here, because the log area might be reused soon and we don't want to
 * confuse recovery
 */
r5l_write_super_and_discard_space(log, next_checkpoint);
mutex_lock(&log->io_mutex);
log->last_checkpoint = next_checkpoint;
r5c_update_log_state(log);
mutex_unlock(&log->io_mutex);
r5l_run_no_space_stripes(log);
}
/*
 * Body of the reclaim md thread: run the cache reclaim followed by the log
 * reclaim. Does nothing when the array has no log.
 */
static void r5l_reclaim_thread(struct md_thread *thread)
{
	struct r5conf *conf = thread->mddev->private;
	struct r5l_log *log = conf->log;

	if (log) {
		r5c_do_reclaim(conf);
		r5l_do_reclaim(log);
	}
}
/*
 * Raise log->reclaim_target to @space (it is never lowered here) and wake
 * the reclaim thread. Returns without waking if the recorded target is
 * already larger than @space. A NULL @log is accepted and ignored.
 */
void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
{
	unsigned long wanted = (unsigned long)space; /* overflow in theory */
	unsigned long seen;

	if (!log)
		return;

	for (;;) {
		seen = log->reclaim_target;
		if (wanted < seen)
			return;
		/* retry if another updater changed the target meanwhile */
		if (cmpxchg(&log->reclaim_target, seen, wanted) == seen)
			break;
	}

	md_wakeup_thread(log->reclaim_thread);
}
/*
 * Quiesce (@quiesce != 0) or resume (@quiesce == 0) the log. Quiescing parks
 * the reclaim thread and runs one full reclaim pass in the caller's
 * context; resuming unparks the reclaim thread.
 */
void r5l_quiesce(struct r5l_log *log, int quiesce)
{
	struct mddev *mddev;

	if (!quiesce) {
		kthread_unpark(log->reclaim_thread->tsk);
		return;
	}

	/* make sure r5l_write_super_and_discard_space exits */
	mddev = log->rdev->mddev;
	wake_up(&mddev->sb_wait);

	kthread_park(log->reclaim_thread->tsk);
	r5l_wake_reclaim(log, MaxSector);
	r5l_do_reclaim(log);
}
/*
 * Report whether the journal device is unusable: with a log present, this is
 * the Faulty bit of the journal rdev; without one, it is whether the array
 * is flagged MD_HAS_JOURNAL (i.e. a journal is expected but missing).
 */
bool r5l_log_disk_error(struct r5conf *conf)
{
	struct r5l_log *log;
	bool failed;

	/* don't allow write if journal disk is missing */
	rcu_read_lock();
	log = rcu_dereference(conf->log);
	failed = log ? test_bit(Faulty, &log->rdev->flags)
		     : test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
	rcu_read_unlock();

	return failed;
}
/* maximum number of pages in the recovery read-ahead pool (ra_pool) */
#define R5L_RECOVERY_PAGE_POOL_SIZE 256
/* State carried through journal recovery. */
struct r5l_recovery_ctx {
struct page *meta_page; /* current meta */
sector_t meta_total_blocks; /* total size of current meta and data */
sector_t pos; /* recovery position */
u64 seq; /* recovery position seq */
int data_parity_stripes; /* number of data_parity stripes */
int data_only_stripes; /* number of data_only stripes */
struct list_head cached_list;
/*
 * read ahead page pool (ra_pool)
 * in recovery, log is read sequentially. It is not efficient to
 * read every page with sync_page_io(). The read ahead page pool
 * reads multiple pages with one IO, so further log read can
 * just copy data from the pool.
 */
struct page *ra_pool[R5L_RECOVERY_PAGE_POOL_SIZE];
sector_t pool_offset; /* offset of first page in the pool */
int total_pages; /* total allocated pages */
int valid_pages; /* pages with valid data */
struct bio *ra_bio; /* bio to do the read ahead */
};
/*
 * Allocate the read ahead bio and as many pool pages as possible, up to
 * R5L_RECOVERY_PAGE_POOL_SIZE. A partially filled pool is accepted.
 *
 * Returns 0 on success, -ENOMEM if the bio or not even one page could be
 * allocated.
 */
static int r5l_recovery_allocate_ra_pool(struct r5l_log *log,
					 struct r5l_recovery_ctx *ctx)
{
	int n;

	ctx->ra_bio = bio_alloc_bioset(GFP_KERNEL, BIO_MAX_PAGES, &log->bs);
	if (!ctx->ra_bio)
		return -ENOMEM;

	ctx->valid_pages = 0;
	ctx->total_pages = 0;

	for (n = 0; n < R5L_RECOVERY_PAGE_POOL_SIZE; n++) {
		struct page *pg = alloc_page(GFP_KERNEL);

		if (!pg)
			break;

		ctx->ra_pool[n] = pg;
		ctx->total_pages = n + 1;
	}

	if (!ctx->total_pages) {
		bio_put(ctx->ra_bio);
		return -ENOMEM;
	}

	ctx->pool_offset = 0;
	return 0;
}
/* Drop every page held in the read ahead pool, then release the pool bio. */
static void r5l_recovery_free_ra_pool(struct r5l_log *log,
				      struct r5l_recovery_ctx *ctx)
{
	struct page **slot = ctx->ra_pool;
	struct page **end = slot + ctx->total_pages;

	while (slot < end)
		put_page(*slot++);

	bio_put(ctx->ra_bio);
}
/*
 * Refill the read ahead pool with one read starting at log @offset.
 *
 * Normally ctx->valid_pages == ctx->total_pages afterwards. When @offset is
 * close to the end of the journal device the read stops where the ring
 * wraps, so ctx->valid_pages may end up smaller than ctx->total_pages.
 *
 * Returns the result of submit_bio_wait().
 */
static int r5l_recovery_fetch_ra_pool(struct r5l_log *log,
				      struct r5l_recovery_ctx *ctx,
				      sector_t offset)
{
	struct bio *bio = ctx->ra_bio;

	bio_reset(bio);
	bio_set_dev(bio, log->rdev->bdev);
	bio_set_op_attrs(bio, REQ_OP_READ, 0);
	bio->bi_iter.bi_sector = log->rdev->data_offset + offset;

	ctx->pool_offset = offset;
	ctx->valid_pages = 0;

	while (ctx->valid_pages < ctx->total_pages) {
		struct page *pg = ctx->ra_pool[ctx->valid_pages++];

		bio_add_page(bio, pg, PAGE_SIZE, 0);

		offset = r5l_ring_add(log, offset, BLOCK_SECTORS);
		if (!offset)	/* reached end of the device */
			break;
	}

	return submit_bio_wait(bio);
}
/*
 * Copy the journal page at @offset into @page, serving it from the read
 * ahead pool. If @offset lies outside the pool's current window the pool is
 * refilled first via r5l_recovery_fetch_ra_pool().
 *
 * Returns 0 on success or the error from the refill; on error @page is left
 * untouched.
 */
static int r5l_recovery_read_page(struct r5l_log *log,
				  struct r5l_recovery_ctx *ctx,
				  struct page *page,
				  sector_t offset)
{
	sector_t window_end = ctx->pool_offset +
			      ctx->valid_pages * BLOCK_SECTORS;
	struct page *cached;

	if (offset < ctx->pool_offset || offset >= window_end) {
		int err = r5l_recovery_fetch_ra_pool(log, ctx, offset);

		if (err)
			return err;
	}

	/* after a (re)fill the requested page must be inside the window */
	BUG_ON(offset < ctx->pool_offset ||
	       offset >= ctx->pool_offset + ctx->valid_pages * BLOCK_SECTORS);

	cached = ctx->ra_pool[(offset - ctx->pool_offset) >>
			      BLOCK_SECTOR_SHIFT];
	memcpy(page_address(page), page_address(cached), PAGE_SIZE);
	return 0;
}
/*
 * Read the meta block at ctx->pos into ctx->meta_page and validate it.
 *
 * Returns 0 for a valid block (ctx->meta_total_blocks is then reset to
 * BLOCK_SECTORS), the read error if the page could not be fetched, or
 * -EINVAL when magic, sequence, version, position, checksum or meta_size
 * does not check out. The checksum field of the in-memory copy is zeroed as
 * a side effect.
 */
static int r5l_recovery_read_meta_block(struct r5l_log *log,
					struct r5l_recovery_ctx *ctx)
{
	struct page *page = ctx->meta_page;
	struct r5l_meta_block *mb;
	u32 on_disk_crc;
	int err;

	err = r5l_recovery_read_page(log, ctx, page, ctx->pos);
	if (err)
		return err;

	mb = page_address(page);

	/* zero the checksum field before recomputing the crc over the page */
	on_disk_crc = le32_to_cpu(mb->checksum);
	mb->checksum = 0;

	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC)
		return -EINVAL;
	if (le64_to_cpu(mb->seq) != ctx->seq)
		return -EINVAL;
	if (mb->version != R5LOG_VERSION)
		return -EINVAL;
	if (le64_to_cpu(mb->position) != ctx->pos)
		return -EINVAL;

	if (crc32c_le(log->uuid_checksum, mb, PAGE_SIZE) != on_disk_crc)
		return -EINVAL;

	if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
		return -EINVAL;

	ctx->meta_total_blocks = BLOCK_SECTORS;
	return 0;
}
/*
 * Initialise @page as a meta block without payload for position @pos and
 * sequence @seq. The page is cleared first, so the checksum field is left
 * at zero for the caller to fill in.
 */
static void
r5l_recovery_create_empty_meta_block(struct r5l_log *log,
				     struct page *page,
				     sector_t pos, u64 seq)
{
	struct r5l_meta_block *mb = page_address(page);

	clear_page(mb);

	mb->position = cpu_to_le64(pos);
	mb->seq = cpu_to_le64(seq);
	mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
	mb->version = R5LOG_VERSION;
	mb->magic = cpu_to_le32(R5LOG_MAGIC);
}
/*
 * Build a checksummed empty meta block for (@pos, @seq) and write it
 * synchronously (REQ_SYNC | REQ_FUA) to the journal device at @pos.
 *
 * Returns 0 on success, -ENOMEM if the temporary page cannot be allocated,
 * -EIO if the write fails.
 */
static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
					  u64 seq)
{
	struct r5l_meta_block *mb;
	struct page *page;
	int ret = 0;

	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	r5l_recovery_create_empty_meta_block(log, page, pos, seq);
	mb = page_address(page);
	mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
					     mb, PAGE_SIZE));

	if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
			  REQ_SYNC | REQ_FUA, false))
		ret = -EIO;

	__free_page(page);
	return ret;
}
/*
 * r5l_recovery_load_data() and r5l_recovery_load_parity() flag pages with
 * R5_Wantwrite to mark valid (potentially not yet flushed) journal content.
 *
 * Checksums were already verified by
 * r5l_recovery_verify_data_checksum_for_mb(), so no mismatch is expected
 * here.
 *
 * This one loads the data page found at @log_offset into the stripe dev that
 * payload->location maps to, records its checksum, accounts one more block
 * in ctx->meta_total_blocks and puts the stripe in caching state.
 */
static void r5l_recovery_load_data(struct r5l_log *log,
				   struct stripe_head *sh,
				   struct r5l_recovery_ctx *ctx,
				   struct r5l_payload_data_parity *payload,
				   sector_t log_offset)
{
	struct r5conf *conf = log->rdev->mddev->private;
	int dd_idx;

	raid5_compute_sector(conf, le64_to_cpu(payload->location), 0,
			     &dd_idx, sh);

	r5l_recovery_read_page(log, ctx, sh->dev[dd_idx].page, log_offset);
	sh->dev[dd_idx].log_checksum = le32_to_cpu(payload->checksum[0]);

	ctx->meta_total_blocks += BLOCK_SECTORS;

	set_bit(R5_Wantwrite, &sh->dev[dd_idx].flags);
	set_bit(STRIPE_R5C_CACHING, &sh->state);
}
/*
 * Load the parity page(s) of a PARITY payload into @sh: P from @log_offset
 * and, when the stripe has a Q device (qd_idx >= 0), Q from the next block.
 * Both are flagged R5_Wantwrite and the stripe leaves caching state.
 * ctx->meta_total_blocks grows by conf->max_degraded blocks.
 */
static void r5l_recovery_load_parity(struct r5l_log *log,
				     struct stripe_head *sh,
				     struct r5l_recovery_ctx *ctx,
				     struct r5l_payload_data_parity *payload,
				     sector_t log_offset)
{
	struct r5conf *conf = log->rdev->mddev->private;
	int pd = sh->pd_idx;

	ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;

	r5l_recovery_read_page(log, ctx, sh->dev[pd].page, log_offset);
	sh->dev[pd].log_checksum = le32_to_cpu(payload->checksum[0]);
	set_bit(R5_Wantwrite, &sh->dev[pd].flags);

	if (sh->qd_idx >= 0) {
		int qd = sh->qd_idx;
		sector_t q_offset = r5l_ring_add(log, log_offset,
						 BLOCK_SECTORS);

		r5l_recovery_read_page(log, ctx, sh->dev[qd].page, q_offset);
		sh->dev[qd].log_checksum = le32_to_cpu(payload->checksum[1]);
		set_bit(R5_Wantwrite, &sh->dev[qd].flags);
	}

	clear_bit(STRIPE_R5C_CACHING, &sh->state);
}
/*
 * Wipe the recovery-related state of @sh: stripe state, journal start
 * position and the flags of every dev.
 */
static void r5l_recovery_reset_stripe(struct stripe_head *sh)
{
	int i;

	sh->state = 0;
	sh->log_start = MaxSector;

	for (i = sh->disks - 1; i >= 0; i--)
		sh->dev[i].flags = 0;
}
/*
* Write the journaled pages of one stripe back to the array member disks.
*
* Every dev flagged R5_Wantwrite is written synchronously to its rdev and,
* if present, to its replacement. A stripe with no R5_Wantwrite data page
* (parity only) is skipped. The stripe is reset on exit in all cases;
* ctx->data_parity_stripes is incremented only when the stripe had data to
* replay.
*
* NOTE(review): the results of sync_page_io() are not checked here.
*/
static void
r5l_recovery_replay_one_stripe(struct r5conf *conf,
struct stripe_head *sh,
struct r5l_recovery_ctx *ctx)
{
struct md_rdev *rdev, *rrdev;
int disk_index;
int data_count = 0;
/* count the data (non P/Q) devs that carry journal content */
for (disk_index = 0; disk_index < sh->disks; disk_index++) {
if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
continue;
if (disk_index == sh->qd_idx || disk_index == sh->pd_idx)
continue;
data_count++;
}
/*
* stripes that only have parity must have been flushed
* before the crash that we are now recovering from, so
* there is nothing more to recover.
*/
if (data_count == 0)
goto out;
for (disk_index = 0; disk_index < sh->disks; disk_index++) {
if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
continue;
/* in case device is broken */
rcu_read_lock();
rdev = rcu_dereference(conf->disks[disk_index].rdev);
if (rdev) {
/* hold a pending reference so the RCU lock can be dropped for the IO */
atomic_inc(&rdev->nr_pending);
rcu_read_unlock();
sync_page_io(rdev, sh->sector, PAGE_SIZE,
sh->dev[disk_index].page, REQ_OP_WRITE, 0,
false);
rdev_dec_pending(rdev, rdev->mddev);
rcu_read_lock();
}
/* same write for the replacement device, if there is one */
rrdev = rcu_dereference(conf->disks[disk_index].replacement);
if (rrdev) {
atomic_inc(&rrdev->nr_pending);
rcu_read_unlock();
sync_page_io(rrdev, sh->sector, PAGE_SIZE,
sh->dev[disk_index].page, REQ_OP_WRITE, 0,
false);
rdev_dec_pending(rrdev, rrdev->mddev);
rcu_read_lock();
}
rcu_read_unlock();
}
ctx->data_parity_stripes++;
out:
r5l_recovery_reset_stripe(sh);
}
/*
 * Get an active stripe for @stripe_sect and reset its recovery state.
 * @noblock is passed through to raid5_get_active_stripe().
 *
 * Returns the stripe, or NULL when none is available.
 */
static struct stripe_head *
r5c_recovery_alloc_stripe(
		struct r5conf *conf,
		sector_t stripe_sect,
		int noblock)
{
	struct stripe_head *sh =
		raid5_get_active_stripe(conf, stripe_sect, 0, noblock, 0);

	if (sh)
		r5l_recovery_reset_stripe(sh);

	return sh;	/* NULL: no more stripe available */
}
/*
 * Find the stripe whose sector equals @sect on @list (linked through
 * sh->lru). Returns the first match, or NULL if there is none.
 */
static struct stripe_head *
r5c_recovery_lookup_stripe(struct list_head *list, sector_t sect)
{
	struct stripe_head *cur;

	list_for_each_entry(cur, list, lru) {
		if (cur->sector != sect)
			continue;
		return cur;
	}

	return NULL;
}
/*
 * Throw away every stripe on @cached_stripe_list: reset it, unlink it from
 * the list and release it. @ctx is not used.
 */
static void
r5c_recovery_drop_stripes(struct list_head *cached_stripe_list,
			  struct r5l_recovery_ctx *ctx)
{
	struct stripe_head *cur, *tmp;

	list_for_each_entry_safe(cur, tmp, cached_stripe_list, lru) {
		r5l_recovery_reset_stripe(cur);
		list_del_init(&cur->lru);
		raid5_release_stripe(cur);
	}
}
static void
r5c_recovery_replay_stripes(struct list_head *cached_stripe_list,
struct r5l_recovery_ctx *ctx)
{
struct stripe_head *sh, *next;
list_for_each_entry_safe(sh, next, cached_stripe_list, lru)
if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
r5l_recovery_replay_one_stripe(sh->raid_conf, sh, ctx);
list_del_init(&sh->lru);
raid5_release_stripe(sh);
}
}
/*
 * Read the journal page at @log_offset into @page and compare its crc32c
 * (seeded with log->uuid_checksum) against @log_checksum.
 *
 * Returns 0 if the checksum matches; otherwise -EINVAL.
 *
 * A page that cannot be read is reported as a mismatch. On a failed read
 * r5l_recovery_read_page() leaves @page untouched, so checksumming it would
 * validate stale content from an earlier call and could produce a bogus
 * match (e.g. when two consecutive journal pages hold identical data).
 */
static int
r5l_recovery_verify_data_checksum(struct r5l_log *log,
				  struct r5l_recovery_ctx *ctx,
				  struct page *page,
				  sector_t log_offset, __le32 log_checksum)
{
	void *addr;
	u32 checksum;

	if (r5l_recovery_read_page(log, ctx, page, log_offset))
		return -EINVAL;

	addr = kmap_atomic(page);
	checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
	kunmap_atomic(addr);

	return (le32_to_cpu(log_checksum) == checksum) ? 0 : -EINVAL;
}
/*
* before loading data to stripe cache, we need verify checksum for all data,
* if there is mismatch for any data page, we drop all data in the mata block
*/
static int
r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
struct r5l_recovery_ctx *ctx)
{
struct mddev *mddev = log->rdev->mddev;
struct r5conf *conf = mddev->private;
struct r5l_meta_block *mb = page_address(ctx->meta_page);
sector_t mb_offset = sizeof(struct r5l_meta_block);
sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
struct page *page;
struct r5l_payload_data_parity *payload;
struct r5l_payload_flush *payload_flush;
page = alloc_page(GFP_KERNEL);
if (!page)
return -ENOMEM;
while (mb_offset < le32_to_cpu(mb->meta_size)) {
payload = (void *)mb + mb_offset;
payload_flush = (void *)mb + mb_offset;
if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
if (r5l_recovery_verify_data_checksum(
log, ctx, page, log_offset,
payload->checksum[0]) < 0)
goto mismatch;
} else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) {
if (r5l_recovery_verify_data_checksum(
log, ctx, page, log_offset,
payload->checksum[0]) < 0)
goto mismatch;
if (conf->max_degraded == 2 && /* q for RAID 6 */
r5l_recovery_verify_data_checksum(
log, ctx, page,
r5l_ring_add(log, log_offset,
BLOCK_SECTORS),
payload->checksum[1]) < 0)
goto mismatch;
} else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
/* nothing to do for R5LOG_PAYLOAD_FLUSH here */
} else /* not R5LOG_PAYLOAD_DATA/PARITY/FLUSH */
goto mismatch;
if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
mb_offset += sizeof(struct r5l_payload_flush) +
le32_to_cpu(payload_flush->size);
} else {
/* DATA or PARITY payload */
log_offset = r5l_ring_add(log, log_offset,
le32_to_cpu(payload->size));
mb_offset += sizeof(struct r5l_payload_data_parity) +
sizeof(__le32) *
(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
}
}
put_page(page);
return 0;
mismatch:
put_page(page);
return -EINVAL;
}
/*
* Analyze all data/parity pages in one meta block (the one currently held in
* ctx->meta_page) and attach them to stripes on @cached_stripe_list.
*
* FLUSH payloads drop the listed stripes from the list; DATA/PARITY payloads
* are loaded into a stripe that is looked up on the list or newly allocated.
*
* Returns:
* 0 for success
* -EINVAL for unknown payload type
* -EAGAIN for checksum mismatch of data page
* -ENOMEM for run out of memory (alloc_page failed or run out of stripes)
*/
static int
r5c_recovery_analyze_meta_block(struct r5l_log *log,
struct r5l_recovery_ctx *ctx,
struct list_head *cached_stripe_list)
{
struct mddev *mddev = log->rdev->mddev;
struct r5conf *conf = mddev->private;
struct r5l_meta_block *mb;
struct r5l_payload_data_parity *payload;
struct r5l_payload_flush *payload_flush;
int mb_offset;
sector_t log_offset;
sector_t stripe_sect;
struct stripe_head *sh;
int ret;
/*
* for mismatch in data blocks, we will drop all data in this mb, but
* we will still read next mb for other data with FLUSH flag, as
* io_unit could finish out of order.
*/
ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx);
if (ret == -EINVAL)
return -EAGAIN;
else if (ret)
return ret; /* -ENOMEM due to alloc_page() failed */
mb = page_address(ctx->meta_page);
mb_offset = sizeof(struct r5l_meta_block);
log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
while (mb_offset < le32_to_cpu(mb->meta_size)) {
int dd;
/* the same bytes viewed as either payload layout */
payload = (void *)mb + mb_offset;
payload_flush = (void *)mb + mb_offset;
if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
int i, count;
/* payload_flush->size is in bytes; each entry is one __le64 stripe sector */
count = le32_to_cpu(payload_flush->size) / sizeof(__le64);
for (i = 0; i < count; ++i) {
stripe_sect = le64_to_cpu(payload_flush->flush_stripes[i]);
sh = r5c_recovery_lookup_stripe(cached_stripe_list,
stripe_sect);
if (sh) {
WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
r5l_recovery_reset_stripe(sh);
list_del_init(&sh->lru);
raid5_release_stripe(sh);
}
}
mb_offset += sizeof(struct r5l_payload_flush) +
le32_to_cpu(payload_flush->size);
continue;
}
/* DATA or PARITY payload */
stripe_sect = (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) ?
raid5_compute_sector(
conf, le64_to_cpu(payload->location), 0, &dd,
NULL)
: le64_to_cpu(payload->location);
sh = r5c_recovery_lookup_stripe(cached_stripe_list,
stripe_sect);
if (!sh) {
sh = r5c_recovery_alloc_stripe(conf, stripe_sect, 1);
/*
* cannot get stripe from raid5_get_active_stripe
* try replay some stripes
*/
if (!sh) {
r5c_recovery_replay_stripes(
cached_stripe_list, ctx);
sh = r5c_recovery_alloc_stripe(
conf, stripe_sect, 1);
}
/* still nothing: try doubling the stripe cache */
if (!sh) {
int new_size = conf->min_nr_stripes * 2;
pr_debug("md/raid:%s: Increasing stripe cache size to %d to recovery data on journal.\n",
mdname(mddev),
new_size);
ret = raid5_set_cache_size(mddev, new_size);
/* min_nr_stripes did not grow past its old value: the resize failed */
if (conf->min_nr_stripes <= new_size / 2) {
pr_err("md/raid:%s: Cannot increase cache size, ret=%d, new_size=%d, min_nr_stripes=%d, max_nr_stripes=%d\n",
mdname(mddev),
ret,
new_size,
conf->min_nr_stripes,
conf->max_nr_stripes);
return -ENOMEM;
}
sh = r5c_recovery_alloc_stripe(
conf, stripe_sect, 0);
}
if (!sh) {
pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n",
mdname(mddev));
return -ENOMEM;
}
list_add_tail(&sh->lru, cached_stripe_list);
}
if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
/*
* data arriving for a stripe that already has its parity
* loaded: replay what we have first, then start over with
* the new data
*/
if (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) {
r5l_recovery_replay_one_stripe(conf, sh, ctx);
list_move_tail(&sh->lru, cached_stripe_list);
}
r5l_recovery_load_data(log, sh, ctx, payload,
log_offset);
} else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
r5l_recovery_load_parity(log, sh, ctx, payload,
log_offset);
else
return -EINVAL;
/* advance past this payload's pages in the log and its entry in the meta block */
log_offset = r5l_ring_add(log, log_offset,
le32_to_cpu(payload->size));
mb_offset += sizeof(struct r5l_payload_data_parity) +
sizeof(__le32) *
(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
}
return 0;
}
/*
 * Load the stripe into cache: every dev that was flagged R5_Wantwrite during
 * recovery loses that flag and becomes R5_InJournal and R5_UPTODATE instead.
 * The stripe will be written out later by the stripe cache state machine.
 */
static void r5c_recovery_load_one_stripe(struct r5l_log *log,
					 struct stripe_head *sh)
{
	int i;

	for (i = sh->disks - 1; i >= 0; i--) {
		struct r5dev *dev = &sh->dev[i];

		if (!test_and_clear_bit(R5_Wantwrite, &dev->flags))
			continue;

		set_bit(R5_InJournal, &dev->flags);
		set_bit(R5_UPTODATE, &dev->flags);
	}
}
/*
 * Scan through the log for all to-be-flushed data.
 *
 * Data-Parity stripes (STRIPE_R5C_CACHING == 0) have all their writes
 * replayed. Data-Only stripes (STRIPE_R5C_CACHING == 1) are loaded into the
 * stripe cache state machine.
 *
 * For a stripe, data seen after parity means the earlier data and parity of
 * that stripe were already flushed to the array and are discarded.
 *
 * The scan stops at the first invalid meta block or on an analyze error
 * other than -EAGAIN; ctx->pos/ctx->seq are left at that point.
 *
 * Returns 0, or -ENOMEM (after dropping all cached stripes) when analysis
 * ran out of memory.
 */
static int r5c_recovery_flush_log(struct r5l_log *log,
				  struct r5l_recovery_ctx *ctx)
{
	struct stripe_head *sh;
	int ret = 0;

	/* scan through the log until a meta block fails validation */
	while (!r5l_recovery_read_meta_block(log, ctx)) {
		ret = r5c_recovery_analyze_meta_block(log, ctx,
						      &ctx->cached_list);
		/*
		 * -EAGAIN means mismatch in data block, in this case, we
		 * still try scan the next metablock
		 */
		if (ret && ret != -EAGAIN)
			break;	/* ret == -EINVAL or -ENOMEM */

		ctx->seq++;
		ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
	}

	if (ret == -ENOMEM) {
		r5c_recovery_drop_stripes(&ctx->cached_list, ctx);
		return ret;
	}

	/* replay data-parity stripes */
	r5c_recovery_replay_stripes(&ctx->cached_list, ctx);

	/* load data-only stripes to stripe cache */
	list_for_each_entry(sh, &ctx->cached_list, lru) {
		WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
		r5c_recovery_load_one_stripe(log, sh);
		ctx->data_only_stripes++;
	}

	return 0;
}
/*
* we did a recovery. Now ctx.pos points to an invalid meta block. New
* log will start here. but we can't let superblock point to last valid
* meta block. The log might look like:
* | meta 1| meta 2| meta 3|
* meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If
* superblock points to meta 1, we write a new valid meta 2n. if crash
* happens again, new recovery will start from meta 1. Since meta 2n is
* valid now, recovery will think meta 3 is valid, which is wrong.
* The solution is we create a new meta in meta2 with its seq == meta
* 1's seq + 10000 and let superblock points to meta2. The same recovery
* will not think meta 3 is a valid meta, because its seq doesn't match
*/
/*
* Before recovery, the log looks like the following
*
* ---------------------------------------------
* | valid log | invalid log |
* ---------------------------------------------
* ^
* |- log->last_checkpoint
* |- log->last_cp_seq
*
* Now we scan through the log until we see invalid entry
*
* ---------------------------------------------
* | valid log | invalid log |
* ---------------------------------------------
* ^ ^
* |- log->last_checkpoint |- ctx->pos
* |- log->last_cp_seq |- ctx->seq
*
* From this point, we need to increase seq number by 10000 to avoid
* confusing next recovery.
*
* ---------------------------------------------
* | valid log | invalid log |
* ---------------------------------------------
* ^ ^
* |- log->last_checkpoint |- ctx->pos+1
* |- log->last_cp_seq |- ctx->seq+10001
*
* However, it is not safe to start the state machine yet, because data only
* parities are not yet secured in RAID. To save these data only parities, we
* rewrite them from seq+10000.
*
* -----------------------------------------------------------------
* | valid log | data only stripes | invalid log |
* -----------------------------------------------------------------
* ^ ^
* |- log->last_checkpoint |- ctx->pos+n
* |- log->last_cp_seq |- ctx->seq+10000+n
*
* If failure happens again during this process, the recovery can safely start
* again from log->last_checkpoint.
*
* Once data only stripes are rewritten to journal, we move log_tail
*
* -----------------------------------------------------------------
* | old log | data only stripes | invalid log |
* -----------------------------------------------------------------
* ^ ^
* |- log->last_checkpoint |- ctx->pos+n
* |- log->last_cp_seq |- ctx->seq+10000+n
*
* Then we can safely start the state machine. If failure happens from this
* point on, the recovery will start from new log->last_checkpoint.
*/
/*
* Rewrite every stripe on ctx->cached_list to the journal, starting at
* ctx->pos with sequence ctx->seq: for each stripe, its R5_InJournal pages
* are written first, followed by a meta block describing them (written with
* REQ_SYNC | REQ_FUA). Each stripe is added to log->stripe_in_journal_list
* and ctx->pos/ctx->seq advance past it. log->next_checkpoint ends up at the
* log_start of the last stripe rewritten.
*
* Returns 0, or -ENOMEM if the meta page cannot be allocated.
*
* NOTE(review): the results of sync_page_io() are not checked here.
*/
static int
r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
struct r5l_recovery_ctx *ctx)
{
struct stripe_head *sh;
struct mddev *mddev = log->rdev->mddev;
struct page *page;
sector_t next_checkpoint = MaxSector;
page = alloc_page(GFP_KERNEL);
if (!page) {
pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n",
mdname(mddev));
return -ENOMEM;
}
WARN_ON(list_empty(&ctx->cached_list));
list_for_each_entry(sh, &ctx->cached_list, lru) {
struct r5l_meta_block *mb;
int i;
int offset;
sector_t write_pos;
WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
r5l_recovery_create_empty_meta_block(log, page,
ctx->pos, ctx->seq);
mb = page_address(page);
/* payload entries are appended right after the meta block header */
offset = le32_to_cpu(mb->meta_size);
/* data pages go to the blocks following the meta block */
write_pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
for (i = sh->disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
struct r5l_payload_data_parity *payload;
void *addr;
if (test_bit(R5_InJournal, &dev->flags)) {
payload = (void *)mb + offset;
payload->header.type = cpu_to_le16(
R5LOG_PAYLOAD_DATA);
payload->size = cpu_to_le32(BLOCK_SECTORS);
payload->location = cpu_to_le64(
raid5_compute_blocknr(sh, i, 0));
addr = kmap_atomic(dev->page);
payload->checksum[0] = cpu_to_le32(
crc32c_le(log->uuid_checksum, addr,
PAGE_SIZE));
kunmap_atomic(addr);
sync_page_io(log->rdev, write_pos, PAGE_SIZE,
dev->page, REQ_OP_WRITE, 0, false);
write_pos = r5l_ring_add(log, write_pos,
BLOCK_SECTORS);
/* one payload header plus a single checksum per data page */
offset += sizeof(__le32) +
sizeof(struct r5l_payload_data_parity);
}
}
mb->meta_size = cpu_to_le32(offset);
/* checksum field is still zero here (page was cleared when created) */
mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
mb, PAGE_SIZE));
sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
REQ_OP_WRITE, REQ_SYNC | REQ_FUA, false);
sh->log_start = ctx->pos;
list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
atomic_inc(&log->stripe_in_journal_count);
ctx->pos = write_pos;
ctx->seq += 1;
next_checkpoint = sh->log_start;
}
log->next_checkpoint = next_checkpoint;
__free_page(page);
return 0;
}
/*
 * Push all recovered data-only stripes through write-out and wait until no
 * stripe is active any more. The journal mode is switched to write-back for
 * the duration and set to write-through afterwards. Does nothing when no
 * data-only stripe was recovered.
 */
static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
						 struct r5l_recovery_ctx *ctx)
{
	struct r5conf *conf = log->rdev->mddev->private;
	struct stripe_head *cur, *tmp;

	if (!ctx->data_only_stripes)
		return;

	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK;

	list_for_each_entry_safe(cur, tmp, &ctx->cached_list, lru) {
		r5c_make_stripe_write_out(cur);
		set_bit(STRIPE_HANDLE, &cur->state);
		list_del_init(&cur->lru);
		raid5_release_stripe(cur);
	}

	/* reuse conf->wait_for_quiescent in recovery */
	wait_event(conf->wait_for_quiescent,
		   atomic_read(&conf->active_stripes) == 0);

	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
}
/*
 * Run journal recovery starting at log->last_checkpoint.
 *
 * The log is scanned and replayed, the sequence number is advanced by 10000,
 * and then either an empty meta block is written (no data-only stripes) or
 * the data-only stripes are rewritten to the journal. Finally the log
 * bookkeeping and superblock tail are updated and the data-only stripes are
 * flushed to the array.
 *
 * Returns 0 on success, -ENOMEM on allocation failure (or as reported by the
 * scan), -EIO if rewriting the data-only stripes fails.
 *
 * NOTE(review): the result of r5l_log_write_empty_meta_block() is ignored.
 */
static int r5l_recovery_log(struct r5l_log *log)
{
	struct mddev *mddev = log->rdev->mddev;
	struct r5l_recovery_ctx *ctx;
	sector_t tail;
	int ret;

	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	ctx->pos = log->last_checkpoint;
	ctx->seq = log->last_cp_seq;
	INIT_LIST_HEAD(&ctx->cached_list);

	ctx->meta_page = alloc_page(GFP_KERNEL);
	if (!ctx->meta_page) {
		ret = -ENOMEM;
		goto free_ctx;
	}

	if (r5l_recovery_allocate_ra_pool(log, ctx) != 0) {
		ret = -ENOMEM;
		goto free_meta;
	}

	ret = r5c_recovery_flush_log(log, ctx);
	if (ret)
		goto free_pool;

	/* the scan stopped here; this becomes the new journal tail */
	tail = ctx->pos;
	ctx->seq += 10000;

	if (ctx->data_only_stripes || ctx->data_parity_stripes)
		pr_info("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n",
			mdname(mddev), ctx->data_only_stripes,
			ctx->data_parity_stripes);
	else
		pr_info("md/raid:%s: starting from clean shutdown\n",
			mdname(mddev));

	if (ctx->data_only_stripes != 0) {
		if (r5c_recovery_rewrite_data_only_stripes(log, ctx)) {
			pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
			       mdname(mddev));
			ret = -EIO;
			goto free_pool;
		}
	} else {
		log->next_checkpoint = ctx->pos;
		r5l_log_write_empty_meta_block(log, ctx->pos, ctx->seq);
		ctx->seq++;
		ctx->pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
	}

	log->log_start = ctx->pos;
	log->seq = ctx->seq;
	log->last_checkpoint = tail;
	r5l_write_super(log, tail);

	r5c_recovery_flush_data_only_stripes(log, ctx);
	ret = 0;

free_pool:
	r5l_recovery_free_ra_pool(log, ctx);
free_meta:
	__free_page(ctx->meta_page);
free_ctx:
	kfree(ctx);
	return ret;
}
/*
 * Record @cp as the journal tail of the journal rdev and flag the array
 * superblock as needing an update (MD_SB_CHANGE_DEVS).
 */
static void r5l_write_super(struct r5l_log *log, sector_t cp)
{
	struct md_rdev *journal = log->rdev;

	journal->journal_tail = cp;
	set_bit(MD_SB_CHANGE_DEVS, &journal->mddev->sb_flags);
}
/*
 * sysfs show handler for journal_mode: prints both mode names with the
 * active one in square brackets. Returns the number of bytes written, 0 if
 * there is no conf/log (or the mode is unknown), or the mddev_lock() error.
 */
static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
{
	struct r5conf *conf;
	int ret;

	ret = mddev_lock(mddev);
	if (ret)
		return ret;

	ret = 0;
	conf = mddev->private;
	if (conf && conf->log) {
		switch (conf->log->r5c_journal_mode) {
		case R5C_JOURNAL_MODE_WRITE_THROUGH:
			ret = snprintf(
				page, PAGE_SIZE, "[%s] %s\n",
				r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
				r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
			break;
		case R5C_JOURNAL_MODE_WRITE_BACK:
			ret = snprintf(
				page, PAGE_SIZE, "%s [%s]\n",
				r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
				r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
			break;
		default:
			/* unknown mode: report nothing */
			break;
		}
	}

	mddev_unlock(mddev);
	return ret;
}
/*
 * Set journal cache mode on @mddev (external API initially needed by
 * dm-raid).
 *
 * @mode: one of 'enum r5c_journal_mode'.
 *
 * The array is suspended while the mode is switched.
 *
 * Returns 0 on success, -EINVAL for an out-of-range @mode or when write-back
 * is requested on a degraded array, -ENODEV when there is no conf or log.
 */
int r5c_journal_mode_set(struct mddev *mddev, int mode)
{
	struct r5conf *conf;

	if (mode < R5C_JOURNAL_MODE_WRITE_THROUGH)
		return -EINVAL;
	if (mode > R5C_JOURNAL_MODE_WRITE_BACK)
		return -EINVAL;

	conf = mddev->private;
	if (!conf || !conf->log)
		return -ENODEV;

	/* write-back is refused while the array is degraded */
	if (raid5_calc_degraded(conf) > 0 &&
	    mode == R5C_JOURNAL_MODE_WRITE_BACK)
		return -EINVAL;

	mddev_suspend(mddev);
	conf->log->r5c_journal_mode = mode;
	mddev_resume(mddev);

	pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n",
		 mdname(mddev), mode, r5c_journal_mode_str[mode]);
	return 0;
}
EXPORT_SYMBOL(r5c_journal_mode_set);
/*
 * sysfs store handler for journal_mode: match the written string (one
 * optional trailing newline is ignored) against r5c_journal_mode_str[] and
 * apply the matching mode under the mddev lock.
 *
 * Returns @length on success or a negative error. An unrecognised string
 * leaves the mode index at -1, which r5c_journal_mode_set() rejects.
 */
static ssize_t r5c_journal_mode_store(struct mddev *mddev,
				      const char *page, size_t length)
{
	size_t len = length;
	int mode;
	int ret;

	if (len < 2)
		return -EINVAL;

	if (page[len - 1] == '\n')
		len--;

	for (mode = ARRAY_SIZE(r5c_journal_mode_str) - 1; mode >= 0; mode--) {
		if (strlen(r5c_journal_mode_str[mode]) == len &&
		    !strncmp(page, r5c_journal_mode_str[mode], len))
			break;
	}

	ret = mddev_lock(mddev);
	if (ret)
		return ret;
	ret = r5c_journal_mode_set(mddev, mode);
	mddev_unlock(mddev);

	if (ret)
		return ret;
	return length;
}
/*
* sysfs attribute "journal_mode" (mode 0644), read through
* r5c_journal_mode_show() and written through r5c_journal_mode_store().
*/
struct md_sysfs_entry
r5c_journal_mode = __ATTR(journal_mode, 0644,
r5c_journal_mode_show, r5c_journal_mode_store);
/*
* Try handle write operation in caching phase. This function should only
* be called in write-back mode (it BUGs otherwise).
*
* @conf: the raid5 configuration
* @sh: stripe being handled
* @s: stripe state from the current handling pass
* @disks: number of devs of @sh to examine
*
* If all outstanding writes can be handled in caching phase, returns 0
* If writes requires write-out phase, returns -EAGAIN; in most of those
* cases r5c_make_stripe_write_out() is called first (the exception is a
* stripe that is already out of caching state with data in journal or
* written).
*/
int r5c_try_caching_write(struct r5conf *conf,
struct stripe_head *sh,
struct stripe_head_state *s,
int disks)
{
struct r5l_log *log = conf->log;
int i;
struct r5dev *dev;
int to_cache = 0;
void **pslot;
sector_t tree_index;
int ret;
uintptr_t refcount;
BUG_ON(!r5c_is_writeback(log));
if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
/*
* There are two different scenarios here:
* 1. The stripe has some data cached, and it is sent to
* write-out phase for reclaim
* 2. The stripe is clean, and this is the first write
*
* For 1, return -EAGAIN, so we continue with
* handle_stripe_dirtying().
*
* For 2, set STRIPE_R5C_CACHING and continue with caching
* write.
*/
/* case 1: anything injournal or anything in written */
if (s->injournal > 0 || s->written > 0)
return -EAGAIN;
/* case 2 */
set_bit(STRIPE_R5C_CACHING, &sh->state);
}
/*
* When run in degraded mode, array is set to write-through mode.
* This check helps drain pending write safely in the transition to
* write-through mode.
*
* When a stripe is syncing, the write is also handled in write
* through mode.
*/
if (s->failed || test_bit(STRIPE_SYNCING, &sh->state)) {
r5c_make_stripe_write_out(sh);
return -EAGAIN;
}
for (i = disks; i--; ) {
dev = &sh->dev[i];
/* if non-overwrite, use writing-out phase */
if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) &&
!test_bit(R5_InJournal, &dev->flags)) {
r5c_make_stripe_write_out(sh);
return -EAGAIN;
}
}
/* if the stripe is not counted in big_stripe_tree, add it now */
if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
!test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
tree_index = r5c_tree_index(conf, sh->sector);
spin_lock(&log->tree_lock);
pslot = radix_tree_lookup_slot(&log->big_stripe_tree,
tree_index);
if (pslot) {
/* slot value is a counter stored shifted by R5C_RADIX_COUNT_SHIFT: bump it */
refcount = (uintptr_t)radix_tree_deref_slot_protected(
pslot, &log->tree_lock) >>
R5C_RADIX_COUNT_SHIFT;
radix_tree_replace_slot(
&log->big_stripe_tree, pslot,
(void *)((refcount + 1) << R5C_RADIX_COUNT_SHIFT));
} else {
/*
* this radix_tree_insert can fail safely, so no
* need to call radix_tree_preload()
*/
ret = radix_tree_insert(
&log->big_stripe_tree, tree_index,
(void *)(1 << R5C_RADIX_COUNT_SHIFT));
if (ret) {
/* could not track the stripe: fall back to write-out */
spin_unlock(&log->tree_lock);
r5c_make_stripe_write_out(sh);
return -EAGAIN;
}
}
spin_unlock(&log->tree_lock);
/*
* set STRIPE_R5C_PARTIAL_STRIPE, this shows the stripe is
* counted in the radix tree
*/
set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state);
atomic_inc(&conf->r5c_cached_partial_stripes);
}
/* mark every dev with pending writes for draining into the cache */
for (i = disks; i--; ) {
dev = &sh->dev[i];
if (dev->towrite) {
set_bit(R5_Wantwrite, &dev->flags);
set_bit(R5_Wantdrain, &dev->flags);
set_bit(R5_LOCKED, &dev->flags);
to_cache++;
}
}
if (to_cache) {
set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
/*
* set STRIPE_LOG_TRAPPED, which triggers r5c_cache_data()
* in ops_run_io(). STRIPE_LOG_TRAPPED will be cleared in
* r5c_handle_data_cached()
*/
set_bit(STRIPE_LOG_TRAPPED, &sh->state);
}
return 0;
}
/*
 * Free the extra pages (orig_page) that were set up for prexor.
 *
 * Every dev whose orig_page differs from its page is pointed back at its own
 * page. If the stripe was borrowing the per-disk extra pages of the conf
 * (detected through dev[0]), those pages are not put; instead
 * R5C_EXTRA_PAGE_IN_USE is cleared and the raid thread is woken.
 */
void r5c_release_extra_page(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	bool borrowed = sh->dev[0].orig_page == conf->disks[0].extra_page;
	int i;

	for (i = sh->disks; i--; ) {
		struct page *extra = sh->dev[i].orig_page;

		if (sh->dev[i].page == extra)
			continue;

		sh->dev[i].orig_page = sh->dev[i].page;
		clear_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);

		if (!borrowed)
			put_page(extra);
	}

	if (borrowed) {
		clear_bit(R5C_EXTRA_PAGE_IN_USE, &conf->cache_state);
		md_wakeup_thread(conf->mddev->thread);
	}
}
/*
 * Point the orig_page of every dev of @sh at the matching per-disk
 * extra_page of the conf, dropping any separately held orig_page first.
 */
void r5c_use_extra_page(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	int i;

	for (i = sh->disks - 1; i >= 0; i--) {
		struct r5dev *dev = &sh->dev[i];

		if (dev->orig_page != dev->page)
			put_page(dev->orig_page);

		dev->orig_page = conf->disks[i].extra_page;
	}
}
/*
* clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the
* stripe is committed to RAID disks.
*
* Does nothing without a log or when the parity dev is not R5_InJournal.
* In write-through mode only the parity dev's R5_InJournal flag is cleared.
* In write-back mode the stripe is additionally removed from the
* stripe-in-journal list, its big_stripe_tree count is dropped, the cached
* and flushing stripe counters are decremented and a flush payload is
* appended to the log.
*/
void r5c_finish_stripe_write_out(struct r5conf *conf,
struct stripe_head *sh,
struct stripe_head_state *s)
{
struct r5l_log *log = conf->log;
int i;
int do_wakeup = 0;
sector_t tree_index;
void **pslot;
uintptr_t refcount;
if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
return;
WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
return;
for (i = sh->disks; i--; ) {
clear_bit(R5_InJournal, &sh->dev[i].flags);
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
do_wakeup = 1;
}
/*
* analyse_stripe() runs before r5c_finish_stripe_write_out(),
* We updated R5_InJournal, so we also update s->injournal.
*/
s->injournal = 0;
if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
if (atomic_dec_and_test(&conf->pending_full_writes))
md_wakeup_thread(conf->mddev->thread);
if (do_wakeup)
wake_up(&conf->wait_for_overlap);
/* the stripe no longer pins journal space */
spin_lock_irq(&log->stripe_in_journal_lock);
list_del_init(&sh->r5c);
spin_unlock_irq(&log->stripe_in_journal_lock);
sh->log_start = MaxSector;
atomic_dec(&log->stripe_in_journal_count);
r5c_update_log_state(log);
/* stop counting this stripe in big_stripe_tree */
if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) ||
test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
tree_index = r5c_tree_index(conf, sh->sector);
spin_lock(&log->tree_lock);
pslot = radix_tree_lookup_slot(&log->big_stripe_tree,
tree_index);
BUG_ON(pslot == NULL);
/* slot value is a counter stored shifted by R5C_RADIX_COUNT_SHIFT */
refcount = (uintptr_t)radix_tree_deref_slot_protected(
pslot, &log->tree_lock) >>
R5C_RADIX_COUNT_SHIFT;
/* last user removes the entry, otherwise just decrement */
if (refcount == 1)
radix_tree_delete(&log->big_stripe_tree, tree_index);
else
radix_tree_replace_slot(
&log->big_stripe_tree, pslot,
(void *)((refcount - 1) << R5C_RADIX_COUNT_SHIFT));
spin_unlock(&log->tree_lock);
}
if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
atomic_dec(&conf->r5c_flushing_partial_stripes);
atomic_dec(&conf->r5c_cached_partial_stripes);
}
if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
atomic_dec(&conf->r5c_flushing_full_stripes);
atomic_dec(&conf->r5c_cached_full_stripes);
}
r5l_append_flush_payload(log, sh->sector);
/* stripe is flushed to raid disks, we can do resync now */
if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
set_bit(STRIPE_HANDLE, &sh->state);
}
/*
 * Checksum the pages of @sh that are marked R5_Wantwrite and try to append
 * them to the journal.
 *
 * When the log cannot take the stripe right now it is parked on the
 * no-space list, or on log->no_mem_stripes if r5l_log_stripe() fails.
 * An extra reference on @sh is taken before the log is touched.
 *
 * Always returns 0.
 */
int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	int nr_pages = 0;
	int sectors_needed;
	int disk;

	BUG_ON(!log);

	/* Checksum each page queued for write and count them. */
	for (disk = 0; disk < sh->disks; disk++) {
		void *kaddr;

		if (test_bit(R5_Wantwrite, &sh->dev[disk].flags)) {
			kaddr = kmap_atomic(sh->dev[disk].page);
			sh->dev[disk].log_checksum =
				crc32c_le(log->uuid_checksum, kaddr, PAGE_SIZE);
			kunmap_atomic(kaddr);
			nr_pages++;
		}
	}
	WARN_ON(nr_pages == 0);

	/*
	 * The stripe has to go through the state machine once more so that
	 * endio gets called; make sure it is not held back as delayed.
	 */
	clear_bit(STRIPE_DELAYED, &sh->state);
	atomic_inc(&sh->count);

	mutex_lock(&log->io_mutex);

	/* one meta block plus the data pages, in 512-byte sectors */
	sectors_needed = (1 + nr_pages) << (PAGE_SHIFT - 9);

	if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
	    sh->log_start == MaxSector) {
		r5l_add_no_space_stripe(log, sh);
	} else if (r5l_has_free_space(log, sectors_needed)) {
		if (r5l_log_stripe(log, sh, nr_pages, 0)) {
			/* logging failed: queue the stripe for a later retry */
			spin_lock_irq(&log->io_list_lock);
			list_add_tail(&sh->log_list, &log->no_mem_stripes);
			spin_unlock_irq(&log->io_list_lock);
		}
	} else {
		if (sh->log_start == log->last_checkpoint)
			BUG();
		r5l_add_no_space_stripe(log, sh);
	}

	mutex_unlock(&log->io_mutex);

	return 0;
}
/*
 * Report whether the big stripe covering @sect currently has an entry in
 * the write-back cache's big_stripe_tree.
 *
 * Returns false when no log is attached to @conf. Warns once if called
 * without rcu_read_lock() held.
 */
bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect)
{
	struct r5l_log *log = conf->log;
	sector_t index;

	if (log == NULL)
		return false;

	WARN_ON_ONCE(!rcu_read_lock_held());

	index = r5c_tree_index(conf, sect);
	if (radix_tree_lookup(&log->big_stripe_tree, index))
		return true;
	return false;
}
/*
 * Read the meta block at the journal tail recorded in the rdev and bring
 * the in-memory log state up to date.
 *
 * If the block found there is not a valid meta block (bad magic/version,
 * checksum mismatch, or a position that does not match where it was read
 * from), a fresh empty log is started at sector 0. Otherwise log recovery
 * is run.
 *
 * Returns 0 on success, -ENOMEM if the scratch page cannot be allocated,
 * -EIO if the tail block cannot be read, or the result of
 * r5l_recovery_log().
 */
static int r5l_load_log(struct r5l_log *log)
{
	struct md_rdev *rdev = log->rdev;
	sector_t tail = log->rdev->journal_tail;
	struct r5l_meta_block *mb;
	struct page *page;
	u32 stored_crc, expected_crc;
	bool create_super = false;
	int ret = 0;

	/* Fall back to sector 0 if the recorded tail is out of range or unaligned. */
	if (tail >= rdev->sectors || round_down(tail, BLOCK_SECTORS) != tail)
		tail = 0;

	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	if (!sync_page_io(rdev, tail, PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
		__free_page(page);
		return -EIO;
	}

	mb = page_address(page);
	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
	    mb->version != R5LOG_VERSION) {
		create_super = true;
	} else {
		/* The CRC is verified with the checksum field zeroed. */
		stored_crc = le32_to_cpu(mb->checksum);
		mb->checksum = 0;
		expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
		if (stored_crc != expected_crc ||
		    le64_to_cpu(mb->position) != tail)
			create_super = true;
	}

	if (create_super) {
		log->last_cp_seq = prandom_u32();
		tail = 0;
		r5l_log_write_empty_meta_block(log, tail, log->last_cp_seq);
		/*
		 * The super must point at the right log tail before any
		 * data lands in the log; otherwise recovery would not be
		 * able to find the log.
		 */
		r5l_write_super(log, tail);
	} else {
		log->last_cp_seq = le64_to_cpu(mb->seq);
	}

	log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
	log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
	if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
		log->max_free_space = RECLAIM_MAX_FREE_SPACE;
	log->last_checkpoint = tail;

	/* mb points into this page; it must not be used past this point. */
	__free_page(page);

	if (!create_super) {
		ret = r5l_recovery_log(log);
	} else {
		log->log_start = r5l_ring_add(log, tail, BLOCK_SECTORS);
		log->seq = log->last_cp_seq + 1;
		log->next_checkpoint = tail;
	}

	r5c_update_log_state(log);
	return ret;
}
int r5l_start(struct r5l_log *log)
{
int ret;
if (!log)
return 0;
ret = r5l_load_log(log);
if (ret) {
struct mddev *mddev = log->rdev->mddev;
struct r5conf *conf = mddev->private;
r5l_exit_log(conf);
}
return ret;
}
/*
 * React to an error on @rdev: when the journal is in write-back mode and
 * either the array is degraded or @rdev carries the Journal flag, schedule
 * the work item that disables write-back.
 *
 * Does nothing when no log is attached to the array.
 */
void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev)
{
	struct r5conf *conf = mddev->private;
	struct r5l_log *log = conf->log;

	if (!log)
		return;

	/*
	 * Use the snapshot that was just NULL-checked. Re-reading conf->log
	 * here could observe NULL if r5l_exit_log() cleared it in between.
	 */
	if ((raid5_calc_degraded(conf) > 0 ||
	     test_bit(Journal, &rdev->flags)) &&
	    log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
		schedule_work(&log->disable_writeback_work);
}
/*
 * Allocate and initialise the journal/cache state for @conf, using @rdev
 * as the journal device, and publish it in conf->log.
 *
 * The log starts in write-through mode. On success MD_HAS_JOURNAL is set
 * on the mddev.
 *
 * Returns 0 on success. Returns -EINVAL when PAGE_SIZE is not 4096 or the
 * array has too many disks for one meta block, -ENOMEM when an allocation
 * fails, or the error reported by the pool/bioset initialisers. On failure
 * everything set up so far is released and conf->log is left untouched.
 */
int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
{
	struct request_queue *q = bdev_get_queue(rdev->bdev);
	struct r5l_log *log;
	char b[BDEVNAME_SIZE];
	int ret;

	pr_debug("md/raid:%s: using device %s as journal\n",
		 mdname(conf->mddev), bdevname(rdev->bdev, b));

	if (PAGE_SIZE != 4096)
		return -EINVAL;

	/*
	 * The PAGE_SIZE must be big enough to hold 1 r5l_meta_block and
	 * raid_disks r5l_payload_data_parity.
	 *
	 * Write journal and cache does not work for very big array
	 * (raid_disks > 203)
	 */
	if (sizeof(struct r5l_meta_block) +
	    ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) *
	     conf->raid_disks) > PAGE_SIZE) {
		pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n",
		       mdname(conf->mddev), conf->raid_disks);
		return -EINVAL;
	}

	log = kzalloc(sizeof(*log), GFP_KERNEL);
	if (!log)
		return -ENOMEM;
	log->rdev = rdev;

	log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;

	log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
				       sizeof(rdev->mddev->uuid));

	mutex_init(&log->io_mutex);

	spin_lock_init(&log->io_list_lock);
	INIT_LIST_HEAD(&log->running_ios);
	INIT_LIST_HEAD(&log->io_end_ios);
	INIT_LIST_HEAD(&log->flushing_ios);
	INIT_LIST_HEAD(&log->finished_ios);
	bio_init(&log->flush_bio, NULL, 0);

	/* KMEM_CACHE() reports failure as NULL, so supply the errno here. */
	log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
	if (!log->io_kc) {
		ret = -ENOMEM;
		goto io_kc;
	}

	ret = mempool_init_slab_pool(&log->io_pool, R5L_POOL_SIZE, log->io_kc);
	if (ret)
		goto io_pool;

	ret = bioset_init(&log->bs, R5L_POOL_SIZE, 0, BIOSET_NEED_BVECS);
	if (ret)
		goto io_bs;

	ret = mempool_init_page_pool(&log->meta_pool, R5L_POOL_SIZE, 0);
	if (ret)
		goto out_mempool;

	spin_lock_init(&log->tree_lock);
	INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN);

	/* md_register_thread() also reports failure as NULL. */
	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
						 log->rdev->mddev, "reclaim");
	if (!log->reclaim_thread) {
		ret = -ENOMEM;
		goto reclaim_thread;
	}
	log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;

	init_waitqueue_head(&log->iounit_wait);

	INIT_LIST_HEAD(&log->no_mem_stripes);

	INIT_LIST_HEAD(&log->no_space_stripes);
	spin_lock_init(&log->no_space_stripes_lock);

	INIT_WORK(&log->deferred_io_work, r5l_submit_io_async);
	INIT_WORK(&log->disable_writeback_work, r5c_disable_writeback_async);

	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
	INIT_LIST_HEAD(&log->stripe_in_journal_list);
	spin_lock_init(&log->stripe_in_journal_lock);
	atomic_set(&log->stripe_in_journal_count, 0);

	/* Publish the log only once it is fully initialised. */
	rcu_assign_pointer(conf->log, log);

	set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
	return 0;

	/* Unwind in reverse order of setup; ret holds the failure reason. */
reclaim_thread:
	mempool_exit(&log->meta_pool);
out_mempool:
	bioset_exit(&log->bs);
io_bs:
	mempool_exit(&log->io_pool);
io_pool:
	kmem_cache_destroy(log->io_kc);
io_kc:
	kfree(log);
	return ret;
}
/*
 * Detach the journal from @conf and release the resources that
 * r5l_init_log() set up for it: the reclaim thread, the meta page pool,
 * the bioset, the io_unit pool and its slab cache, and the log itself.
 *
 * NOTE(review): conf->log is dereferenced without a NULL check, so this
 * must only be called while a log is attached - confirm at call sites.
 */
void r5l_exit_log(struct r5conf *conf)
{
struct r5l_log *log = conf->log;
/*
 * Clear conf->log first, then wait for an RCU grace period before any
 * resource is released. Presumably this lets readers that fetched
 * conf->log under rcu_read_lock() finish - TODO confirm.
 */
conf->log = NULL;
synchronize_rcu();
/* Ensure disable_writeback_work wakes up and exits */
wake_up(&conf->mddev->sb_wait);
flush_work(&log->disable_writeback_work);
md_unregister_thread(&log->reclaim_thread);
/* Release the pools and cache in the reverse order of their setup in r5l_init_log(). */
mempool_exit(&log->meta_pool);
bioset_exit(&log->bs);
mempool_exit(&log->io_pool);
kmem_cache_destroy(log->io_kc);
kfree(log);
}