From 03002d0374dd37a1969c6c3aa5e16ffe9844af24 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 10 May 2018 14:56:41 -0700 Subject: [PATCH 01/37] ANDROID: ftrace: fix function type mismatches This change fixes indirect call mismatches with function and function graph tracing, which trip Control-Flow Integrity (CFI) checking. Bug: 79510107 Bug: 67506682 Change-Id: I5de08c113fb970ffefedce93c58e0161f22c7ca2 Signed-off-by: Sami Tolvanen --- include/linux/ftrace.h | 8 ++++++++ kernel/trace/ftrace.c | 17 +++++++++++------ 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 07edd89dca53..ff3e5b109f9c 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -254,8 +254,16 @@ static inline int ftrace_function_local_disabled(struct ftrace_ops *ops) return *this_cpu_ptr(ops->disabled); } +#ifdef CONFIG_CFI_CLANG +/* Use a C stub with the correct type for CFI */ +static inline void ftrace_stub(unsigned long a0, unsigned long a1, + struct ftrace_ops *op, struct pt_regs *regs) +{ +} +#else extern void ftrace_stub(unsigned long a0, unsigned long a1, struct ftrace_ops *op, struct pt_regs *regs); +#endif #else /* !CONFIG_FUNCTION_TRACER */ /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c52b8e4e173a..e69d28e80ae4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -122,8 +122,9 @@ static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs); #else /* See comment below, where ftrace_ops_list_func is defined */ -static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); -#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops) +static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *regs); +#define ftrace_ops_list_func ftrace_ops_no_ops #endif /* @@ -6098,7 +6099,8 @@ static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, __ftrace_ops_list_func(ip, parent_ip, NULL, regs); } #else -static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) +static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *regs) { __ftrace_ops_list_func(ip, parent_ip, NULL, NULL); } @@ -6562,14 +6564,17 @@ void ftrace_graph_graph_time_control(bool enable) fgraph_graph_time = enable; } +void ftrace_graph_return_stub(struct ftrace_graph_ret *trace) +{ +} + int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) { return 0; } /* The callbacks that hook a function */ -trace_func_graph_ret_t ftrace_graph_return = - (trace_func_graph_ret_t)ftrace_stub; +trace_func_graph_ret_t ftrace_graph_return = ftrace_graph_return_stub; trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub; @@ -6797,7 +6802,7 @@ void unregister_ftrace_graph(void) goto out; ftrace_graph_active--; - ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; + ftrace_graph_return = ftrace_graph_return_stub; ftrace_graph_entry = ftrace_graph_entry_stub; __ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET); From 0dafb9f618ddd207e1a481f2ba73c7ebd0f42a8a Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Fri, 20 Jul 2018 09:52:12 -0700 Subject: [PATCH 02/37] ANDROID: remove android config fragments The authoritative versions of the android kconfig fragments were moved into a separate repository some time back: https://android.googlesource.com/kernel/configs/ Change-Id: I1fc770b4f040de983c6b7a041502aef864c86cff Signed-off-by: Steve Muckle --- kernel/configs/android-base.config | 161 ---------------------- kernel/configs/android-recommended.config | 129 ----------------- 2 files changed, 290 deletions(-) delete mode 100644 kernel/configs/android-base.config delete mode 100644 kernel/configs/android-recommended.config diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config deleted file mode 100644 index d3fd428f4b92..000000000000 --- a/kernel/configs/android-base.config +++ /dev/null @@ -1,161 +0,0 @@ -# KEEP ALPHABETICALLY SORTED -# CONFIG_DEVKMEM is not set -# CONFIG_DEVMEM is not set -# CONFIG_FHANDLE is not set -# CONFIG_INET_LRO is not set -# CONFIG_NFSD is not set -# CONFIG_NFS_FS is not set -# CONFIG_OABI_COMPAT is not set -# CONFIG_SYSVIPC is not set -# CONFIG_USELIB is not set -CONFIG_ANDROID=y -CONFIG_ANDROID_BINDER_IPC=y -CONFIG_ANDROID_BINDER_DEVICES=binder,hwbinder,vndbinder -CONFIG_ANDROID_LOW_MEMORY_KILLER=y -CONFIG_ARMV8_DEPRECATED=y -CONFIG_ASHMEM=y -CONFIG_AUDIT=y -CONFIG_BLK_DEV_INITRD=y -CONFIG_CGROUPS=y -CONFIG_CGROUP_BPF=y -CONFIG_CGROUP_CPUACCT=y -CONFIG_CGROUP_DEBUG=y -CONFIG_CGROUP_FREEZER=y -CONFIG_CGROUP_SCHED=y -CONFIG_CP15_BARRIER_EMULATION=y -CONFIG_DEFAULT_SECURITY_SELINUX=y -CONFIG_EMBEDDED=y -CONFIG_FB=y -CONFIG_HARDENED_USERCOPY=y -CONFIG_HIGH_RES_TIMERS=y -CONFIG_IKCONFIG=y -CONFIG_IKCONFIG_PROC=y -CONFIG_INET6_AH=y -CONFIG_INET6_ESP=y -CONFIG_INET6_IPCOMP=y -CONFIG_INET=y -CONFIG_INET_DIAG_DESTROY=y -CONFIG_INET_ESP=y -CONFIG_INET_XFRM_MODE_TUNNEL=y -CONFIG_IP6_NF_FILTER=y -CONFIG_IP6_NF_IPTABLES=y -CONFIG_IP6_NF_MANGLE=y -CONFIG_IP6_NF_RAW=y -CONFIG_IP6_NF_TARGET_REJECT=y -CONFIG_IPV6=y -CONFIG_IPV6_MIP6=y -CONFIG_IPV6_MULTIPLE_TABLES=y -CONFIG_IPV6_OPTIMISTIC_DAD=y -CONFIG_IPV6_ROUTER_PREF=y -CONFIG_IPV6_ROUTE_INFO=y -CONFIG_IP_ADVANCED_ROUTER=y -CONFIG_IP_MULTICAST=y -CONFIG_IP_MULTIPLE_TABLES=y -CONFIG_IP_NF_ARPFILTER=y -CONFIG_IP_NF_ARPTABLES=y -CONFIG_IP_NF_ARP_MANGLE=y -CONFIG_IP_NF_FILTER=y -CONFIG_IP_NF_IPTABLES=y -CONFIG_IP_NF_MANGLE=y -CONFIG_IP_NF_MATCH_AH=y -CONFIG_IP_NF_MATCH_ECN=y -CONFIG_IP_NF_MATCH_TTL=y -CONFIG_IP_NF_NAT=y -CONFIG_IP_NF_RAW=y -CONFIG_IP_NF_SECURITY=y -CONFIG_IP_NF_TARGET_MASQUERADE=y -CONFIG_IP_NF_TARGET_NETMAP=y -CONFIG_IP_NF_TARGET_REDIRECT=y -CONFIG_IP_NF_TARGET_REJECT=y -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y -CONFIG_MODVERSIONS=y -CONFIG_NET=y -CONFIG_NETDEVICES=y -CONFIG_NETFILTER=y -CONFIG_NETFILTER_TPROXY=y -CONFIG_NETFILTER_XT_MATCH_COMMENT=y -CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y -CONFIG_NETFILTER_XT_MATCH_CONNMARK=y -CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y -CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y -CONFIG_NETFILTER_XT_MATCH_HELPER=y -CONFIG_NETFILTER_XT_MATCH_IPRANGE=y -CONFIG_NETFILTER_XT_MATCH_LENGTH=y -CONFIG_NETFILTER_XT_MATCH_LIMIT=y -CONFIG_NETFILTER_XT_MATCH_MAC=y -CONFIG_NETFILTER_XT_MATCH_MARK=y -CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y -CONFIG_NETFILTER_XT_MATCH_POLICY=y -CONFIG_NETFILTER_XT_MATCH_QUOTA=y -CONFIG_NETFILTER_XT_MATCH_SOCKET=y -CONFIG_NETFILTER_XT_MATCH_STATE=y -CONFIG_NETFILTER_XT_MATCH_STATISTIC=y -CONFIG_NETFILTER_XT_MATCH_STRING=y -CONFIG_NETFILTER_XT_MATCH_TIME=y -CONFIG_NETFILTER_XT_MATCH_U32=y -CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y -CONFIG_NETFILTER_XT_TARGET_CONNMARK=y -CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y -CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y -CONFIG_NETFILTER_XT_TARGET_MARK=y -CONFIG_NETFILTER_XT_TARGET_NFLOG=y -CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y -CONFIG_NETFILTER_XT_TARGET_SECMARK=y -CONFIG_NETFILTER_XT_TARGET_TCPMSS=y -CONFIG_NETFILTER_XT_TARGET_TPROXY=y -CONFIG_NETFILTER_XT_TARGET_TRACE=y -CONFIG_NET_CLS_ACT=y -CONFIG_NET_CLS_U32=y -CONFIG_NET_EMATCH=y -CONFIG_NET_EMATCH_U32=y -CONFIG_NET_KEY=y -CONFIG_NET_SCHED=y -CONFIG_NET_SCH_HTB=y -CONFIG_NF_CONNTRACK=y -CONFIG_NF_CONNTRACK_AMANDA=y -CONFIG_NF_CONNTRACK_EVENTS=y -CONFIG_NF_CONNTRACK_FTP=y -CONFIG_NF_CONNTRACK_H323=y -CONFIG_NF_CONNTRACK_IPV4=y -CONFIG_NF_CONNTRACK_IPV6=y -CONFIG_NF_CONNTRACK_IRC=y -CONFIG_NF_CONNTRACK_NETBIOS_NS=y -CONFIG_NF_CONNTRACK_PPTP=y -CONFIG_NF_CONNTRACK_SANE=y -CONFIG_NF_CONNTRACK_SECMARK=y -CONFIG_NF_CONNTRACK_TFTP=y -CONFIG_NF_CT_NETLINK=y -CONFIG_NF_CT_PROTO_DCCP=y -CONFIG_NF_CT_PROTO_SCTP=y -CONFIG_NF_CT_PROTO_UDPLITE=y -CONFIG_NF_NAT=y -CONFIG_NO_HZ=y -CONFIG_PACKET=y -CONFIG_PM_AUTOSLEEP=y -CONFIG_PM_WAKELOCKS=y -CONFIG_PPP=y -CONFIG_PPP_BSDCOMP=y -CONFIG_PPP_DEFLATE=y -CONFIG_PPP_MPPE=y -CONFIG_PREEMPT=y -CONFIG_QUOTA=y -CONFIG_RANDOMIZE_BASE=y -CONFIG_RTC_CLASS=y -CONFIG_RT_GROUP_SCHED=y -CONFIG_SECCOMP=y -CONFIG_SECURITY=y -CONFIG_SECURITY_NETWORK=y -CONFIG_SECURITY_SELINUX=y -CONFIG_SETEND_EMULATION=y -CONFIG_STAGING=y -CONFIG_SWP_EMULATION=y -CONFIG_SYNC=y -CONFIG_TUN=y -CONFIG_UNIX=y -CONFIG_USB_GADGET=y -CONFIG_USB_CONFIGFS=y -CONFIG_USB_CONFIGFS_F_FS=y -CONFIG_USB_CONFIGFS_F_MIDI=y -CONFIG_USB_OTG_WAKELOCK=y -CONFIG_XFRM_USER=y diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config deleted file mode 100644 index 946fb92418f7..000000000000 --- a/kernel/configs/android-recommended.config +++ /dev/null @@ -1,129 +0,0 @@ -# KEEP ALPHABETICALLY SORTED -# CONFIG_AIO is not set -# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set -# CONFIG_INPUT_MOUSE is not set -# CONFIG_LEGACY_PTYS is not set -# CONFIG_NF_CONNTRACK_SIP is not set -# CONFIG_PM_WAKELOCKS_GC is not set -# CONFIG_VT is not set -CONFIG_ARM64_SW_TTBR0_PAN=y -CONFIG_BACKLIGHT_LCD_SUPPORT=y -CONFIG_BLK_DEV_DM=y -CONFIG_BLK_DEV_LOOP=y -CONFIG_BLK_DEV_RAM=y -CONFIG_BLK_DEV_RAM_SIZE=8192 -CONFIG_CC_STACKPROTECTOR_STRONG=y -CONFIG_COMPACTION=y -CONFIG_CPU_SW_DOMAIN_PAN=y -CONFIG_DM_CRYPT=y -CONFIG_DM_UEVENT=y -CONFIG_DM_VERITY=y -CONFIG_DM_VERITY_FEC=y -CONFIG_DRAGONRISE_FF=y -CONFIG_ENABLE_DEFAULT_TRACERS=y -CONFIG_EXT4_FS=y -CONFIG_EXT4_FS_SECURITY=y -CONFIG_FUSE_FS=y -CONFIG_GREENASIA_FF=y -CONFIG_HIDRAW=y -CONFIG_HID_A4TECH=y -CONFIG_HID_ACRUX=y -CONFIG_HID_ACRUX_FF=y -CONFIG_HID_APPLE=y -CONFIG_HID_BELKIN=y -CONFIG_HID_CHERRY=y -CONFIG_HID_CHICONY=y -CONFIG_HID_CYPRESS=y -CONFIG_HID_DRAGONRISE=y -CONFIG_HID_ELECOM=y -CONFIG_HID_EMS_FF=y -CONFIG_HID_EZKEY=y -CONFIG_HID_GREENASIA=y -CONFIG_HID_GYRATION=y -CONFIG_HID_HOLTEK=y -CONFIG_HID_KENSINGTON=y -CONFIG_HID_KEYTOUCH=y -CONFIG_HID_KYE=y -CONFIG_HID_LCPOWER=y -CONFIG_HID_LOGITECH=y -CONFIG_HID_LOGITECH_DJ=y -CONFIG_HID_MAGICMOUSE=y -CONFIG_HID_MICROSOFT=y -CONFIG_HID_MONTEREY=y -CONFIG_HID_MULTITOUCH=y -CONFIG_HID_NTRIG=y -CONFIG_HID_ORTEK=y -CONFIG_HID_PANTHERLORD=y -CONFIG_HID_PETALYNX=y -CONFIG_HID_PICOLCD=y -CONFIG_HID_PRIMAX=y -CONFIG_HID_PRODIKEYS=y -CONFIG_HID_ROCCAT=y -CONFIG_HID_SAITEK=y -CONFIG_HID_SAMSUNG=y -CONFIG_HID_SMARTJOYPLUS=y -CONFIG_HID_SONY=y -CONFIG_HID_SPEEDLINK=y -CONFIG_HID_SUNPLUS=y -CONFIG_HID_THRUSTMASTER=y -CONFIG_HID_TIVO=y -CONFIG_HID_TOPSEED=y -CONFIG_HID_TWINHAN=y -CONFIG_HID_UCLOGIC=y -CONFIG_HID_WACOM=y -CONFIG_HID_WALTOP=y -CONFIG_HID_WIIMOTE=y -CONFIG_HID_ZEROPLUS=y -CONFIG_HID_ZYDACRON=y -CONFIG_INPUT_EVDEV=y -CONFIG_INPUT_GPIO=y -CONFIG_INPUT_JOYSTICK=y -CONFIG_INPUT_MISC=y -CONFIG_INPUT_TABLET=y -CONFIG_INPUT_UINPUT=y -CONFIG_ION=y -CONFIG_JOYSTICK_XPAD=y -CONFIG_JOYSTICK_XPAD_FF=y -CONFIG_JOYSTICK_XPAD_LEDS=y -CONFIG_KALLSYMS_ALL=y -CONFIG_KSM=y -CONFIG_LOGIG940_FF=y -CONFIG_LOGIRUMBLEPAD2_FF=y -CONFIG_LOGITECH_FF=y -CONFIG_MD=y -CONFIG_MEDIA_SUPPORT=y -CONFIG_MSDOS_FS=y -CONFIG_PANIC_TIMEOUT=5 -CONFIG_PANTHERLORD_FF=y -CONFIG_PERF_EVENTS=y -CONFIG_PM_DEBUG=y -CONFIG_PM_RUNTIME=y -CONFIG_PM_WAKELOCKS_LIMIT=0 -CONFIG_POWER_SUPPLY=y -CONFIG_PSTORE=y -CONFIG_PSTORE_CONSOLE=y -CONFIG_PSTORE_RAM=y -CONFIG_SCHEDSTATS=y -CONFIG_SMARTJOYPLUS_FF=y -CONFIG_SND=y -CONFIG_SOUND=y -CONFIG_STRICT_KERNEL_RWX=y -CONFIG_SUSPEND_TIME=y -CONFIG_TABLET_USB_ACECAD=y -CONFIG_TABLET_USB_AIPTEK=y -CONFIG_TABLET_USB_GTCO=y -CONFIG_TABLET_USB_HANWANG=y -CONFIG_TABLET_USB_KBTAB=y -CONFIG_TASKSTATS=y -CONFIG_TASK_DELAY_ACCT=y -CONFIG_TASK_IO_ACCOUNTING=y -CONFIG_TASK_XACCT=y -CONFIG_TIMER_STATS=y -CONFIG_TMPFS=y -CONFIG_TMPFS_POSIX_ACL=y -CONFIG_UHID=y -CONFIG_USB_ANNOUNCE_NEW_DEVICES=y -CONFIG_USB_EHCI_HCD=y -CONFIG_USB_HIDDEV=y -CONFIG_USB_USBNET=y -CONFIG_VFAT_FS=y From c37a593cdbbf5ef7700a5c9c24d1fc78eb473d19 Mon Sep 17 00:00:00 2001 From: David Zeuthen Date: Tue, 24 Jan 2017 13:17:01 -0500 Subject: [PATCH 03/37] ANDROID: AVB error handler to invalidate vbmeta partition. If androidboot.vbmeta.invalidate_on_error is 'yes' and androidboot.vbmeta.device is set and points to a device with vbmeta magic, this header will be overwritten upon an irrecoverable dm-verity error. The side-effect of this is that the slot will fail to verify on next reboot, effectively triggering the boot loader to fallback to another slot. This work both if the vbmeta struct is at the start of a partition or if there's an AVB footer at the end. This code is based on drivers/md/dm-verity-chromeos.c from ChromiumOS. Bug: 31622239 Test: Manually tested (other arch). Change-Id: I571b5a75461da38ad832a9bea33c298bef859e26 Signed-off-by: David Zeuthen --- drivers/md/Kconfig | 9 ++ drivers/md/Makefile | 4 + drivers/md/dm-verity-avb.c | 229 ++++++++++++++++++++++++++++++++++ drivers/md/dm-verity-target.c | 6 +- drivers/md/dm-verity.h | 1 + 5 files changed, 248 insertions(+), 1 deletion(-) create mode 100644 drivers/md/dm-verity-avb.c diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index d87c4725ed5b..ec359e16525b 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -555,6 +555,15 @@ config DM_ZONED If unsure, say N. +config DM_VERITY_AVB + tristate "Support AVB specific verity error behavior" + depends on DM_VERITY + ---help--- + Enables Android Verified Boot platform-specific error + behavior. In particular, it will modify the vbmeta partition + specified on the kernel command-line when non-transient error + occurs (followed by a panic). + config DM_ANDROID_VERITY bool "Android verity target support" depends on BLK_DEV_DM=y diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 83109ad30a48..f98717ffff9e 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -72,6 +72,10 @@ ifeq ($(CONFIG_DM_VERITY_FEC),y) dm-verity-objs += dm-verity-fec.o endif +ifeq ($(CONFIG_DM_VERITY_AVB),y) +dm-verity-objs += dm-verity-avb.o +endif + ifeq ($(CONFIG_DM_ANDROID_VERITY),y) dm-verity-objs += dm-android-verity.o endif diff --git a/drivers/md/dm-verity-avb.c b/drivers/md/dm-verity-avb.c new file mode 100644 index 000000000000..a9f102aa379e --- /dev/null +++ b/drivers/md/dm-verity-avb.c @@ -0,0 +1,229 @@ +/* + * Copyright (C) 2017 Google. + * + * This file is released under the GPLv2. + * + * Based on drivers/md/dm-verity-chromeos.c + */ + +#include +#include +#include + +#define DM_MSG_PREFIX "verity-avb" + +/* Set via module parameters. */ +static char avb_vbmeta_device[64]; +static char avb_invalidate_on_error[4]; + +static void invalidate_vbmeta_endio(struct bio *bio) +{ + if (bio->bi_status) + DMERR("invalidate_vbmeta_endio: error %d", bio->bi_status); + complete(bio->bi_private); +} + +static int invalidate_vbmeta_submit(struct bio *bio, + struct block_device *bdev, + int op, int access_last_sector, + struct page *page) +{ + DECLARE_COMPLETION_ONSTACK(wait); + + bio->bi_private = &wait; + bio->bi_end_io = invalidate_vbmeta_endio; + bio_set_dev(bio, bdev); + bio_set_op_attrs(bio, op, REQ_SYNC); + + bio->bi_iter.bi_sector = 0; + if (access_last_sector) { + sector_t last_sector; + + last_sector = (i_size_read(bdev->bd_inode)>>SECTOR_SHIFT) - 1; + bio->bi_iter.bi_sector = last_sector; + } + if (!bio_add_page(bio, page, PAGE_SIZE, 0)) { + DMERR("invalidate_vbmeta_submit: bio_add_page error"); + return -EIO; + } + + submit_bio(bio); + /* Wait up to 2 seconds for completion or fail. */ + if (!wait_for_completion_timeout(&wait, msecs_to_jiffies(2000))) + return -EIO; + return 0; +} + +static int invalidate_vbmeta(dev_t vbmeta_devt) +{ + int ret = 0; + struct block_device *bdev; + struct bio *bio; + struct page *page; + fmode_t dev_mode; + /* Ensure we do synchronous unblocked I/O. We may also need + * sync_bdev() on completion, but it really shouldn't. + */ + int access_last_sector = 0; + + DMINFO("invalidate_vbmeta: acting on device %d:%d", + MAJOR(vbmeta_devt), MINOR(vbmeta_devt)); + + /* First we open the device for reading. */ + dev_mode = FMODE_READ | FMODE_EXCL; + bdev = blkdev_get_by_dev(vbmeta_devt, dev_mode, + invalidate_vbmeta); + if (IS_ERR(bdev)) { + DMERR("invalidate_kernel: could not open device for reading"); + dev_mode = 0; + ret = -ENOENT; + goto failed_to_read; + } + + bio = bio_alloc(GFP_NOIO, 1); + if (!bio) { + ret = -ENOMEM; + goto failed_bio_alloc; + } + + page = alloc_page(GFP_NOIO); + if (!page) { + ret = -ENOMEM; + goto failed_to_alloc_page; + } + + access_last_sector = 0; + ret = invalidate_vbmeta_submit(bio, bdev, REQ_OP_READ, + access_last_sector, page); + if (ret) { + DMERR("invalidate_vbmeta: error reading"); + goto failed_to_submit_read; + } + + /* We have a page. Let's make sure it looks right. */ + if (memcmp("AVB0", page_address(page), 4) == 0) { + /* Stamp it. */ + memcpy(page_address(page), "AVE0", 4); + DMINFO("invalidate_vbmeta: found vbmeta partition"); + } else { + /* Could be this is on a AVB footer, check. Also, since the + * AVB footer is in the last 64 bytes, adjust for the fact that + * we're dealing with 512-byte sectors. + */ + size_t offset = (1<bi_remaining. + */ + bio_reset(bio); + + ret = invalidate_vbmeta_submit(bio, bdev, REQ_OP_WRITE, + access_last_sector, page); + if (ret) { + DMERR("invalidate_vbmeta: error writing"); + goto failed_to_submit_write; + } + + DMERR("invalidate_vbmeta: completed."); + ret = 0; +failed_to_submit_write: +failed_to_write: +invalid_header: + __free_page(page); +failed_to_submit_read: + /* Technically, we'll leak a page with the pending bio, but + * we're about to reboot anyway. + */ +failed_to_alloc_page: + bio_put(bio); +failed_bio_alloc: + if (dev_mode) + blkdev_put(bdev, dev_mode); +failed_to_read: + return ret; +} + +void dm_verity_avb_error_handler(void) +{ + dev_t dev; + + DMINFO("AVB error handler called for %s", avb_vbmeta_device); + + if (strcmp(avb_invalidate_on_error, "yes") != 0) { + DMINFO("Not configured to invalidate"); + return; + } + + if (avb_vbmeta_device[0] == '\0') { + DMERR("avb_vbmeta_device parameter not set"); + goto fail_no_dev; + } + + dev = name_to_dev_t(avb_vbmeta_device); + if (!dev) { + DMERR("No matching partition for device: %s", + avb_vbmeta_device); + goto fail_no_dev; + } + + invalidate_vbmeta(dev); + +fail_no_dev: + ; +} + +static int __init dm_verity_avb_init(void) +{ + DMINFO("AVB error handler initialized with vbmeta device: %s", + avb_vbmeta_device); + return 0; +} + +static void __exit dm_verity_avb_exit(void) +{ +} + +module_init(dm_verity_avb_init); +module_exit(dm_verity_avb_exit); + +MODULE_AUTHOR("David Zeuthen "); +MODULE_DESCRIPTION("AVB-specific error handler for dm-verity"); +MODULE_LICENSE("GPL"); + +/* Declare parameter with no module prefix */ +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "androidboot.vbmeta." +module_param_string(device, avb_vbmeta_device, sizeof(avb_vbmeta_device), 0); +module_param_string(invalidate_on_error, avb_invalidate_on_error, + sizeof(avb_invalidate_on_error), 0); diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index ecb506b6923d..be1e88881122 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -276,8 +276,12 @@ out: if (v->mode == DM_VERITY_MODE_LOGGING) return 0; - if (v->mode == DM_VERITY_MODE_RESTART) + if (v->mode == DM_VERITY_MODE_RESTART) { +#ifdef CONFIG_DM_VERITY_AVB + dm_verity_avb_error_handler(); +#endif kernel_restart("dm-verity device corrupted"); + } return 1; } diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index e80e06aa5ec6..116e91fa50d7 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -142,4 +142,5 @@ extern void verity_io_hints(struct dm_target *ti, struct queue_limits *limits); extern void verity_dtr(struct dm_target *ti); extern int verity_ctr(struct dm_target *ti, unsigned argc, char **argv); extern int verity_map(struct dm_target *ti, struct bio *bio); +extern void dm_verity_avb_error_handler(void); #endif /* DM_VERITY_H */ From 9001b8f1a599b974ca8f2f925c210a31c0c70d60 Mon Sep 17 00:00:00 2001 From: Puja Gupta Date: Fri, 11 May 2018 14:05:37 -0700 Subject: [PATCH 04/37] ANDROID: sched: Add support for frequency/power energy model This patch provides support to pass frequency/power values from energy model instead of capacity/power. It is required to support multiple speed bin devices i.e. devices which have different max frequency based on hardware. User can provide all supported frequencies across all versions of the device in device tree energy model. Max frequency of the cpu is read at runtime from OPP driver and the capacity values are populated using that. Energy model proc file will display cap_states as a tuple of (capacity, frequency, power), in which frequency will be 0 when not using frequency energy model. Change-Id: I25aec1f91aa8f09bcf67f0575792693808847618 Signed-off-by: Puja Gupta --- .../bindings/scheduler/sched-energy-costs.txt | 5 + include/linux/sched/topology.h | 2 + kernel/sched/energy.c | 176 +++++++++++++++++- kernel/sched/fair.c | 2 +- 4 files changed, 178 insertions(+), 7 deletions(-) diff --git a/Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt b/Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt index 2ceb202462cb..447cc32b3f21 100644 --- a/Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt +++ b/Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt @@ -237,6 +237,11 @@ Property added to the cpu node: Any other configuration is invalid. +- freq-energy-model + Description: Optional. Must be declared if the energy model + represents frequency/power values. If absent, energy model is + by default considered as capacity/power. + =========================================================== 5 - Example dts =========================================================== diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index e0161c3da0da..5fcb7dd36048 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -69,6 +69,7 @@ extern int sched_domain_level_max; struct capacity_state { unsigned long cap; /* compute capacity */ + unsigned long frequency;/* frequency */ unsigned long power; /* power consumption at this compute capacity */ }; @@ -194,6 +195,7 @@ typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); typedef int (*sched_domain_flags_f)(void); typedef const struct sched_group_energy * const(*sched_domain_energy_f)(int cpu); +extern bool energy_aware(void); #define SDTL_OVERLAP 0x01 diff --git a/kernel/sched/energy.c b/kernel/sched/energy.c index be470c695293..8342de041e11 100644 --- a/kernel/sched/energy.c +++ b/kernel/sched/energy.c @@ -18,8 +18,6 @@ */ #define pr_fmt(fmt) "sched-energy: " fmt -#define DEBUG - #include #include #include @@ -28,6 +26,11 @@ #include #include #include +#include +#include +#include + +#include "sched.h" struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS]; @@ -47,6 +50,8 @@ static void free_resources(void) } } } +static bool sge_ready; +static bool freq_energy_model; void check_max_cap_vs_cpu_scale(int cpu, struct sched_group_energy *sge) { @@ -83,6 +88,9 @@ void init_sched_energy_costs(void) pr_warn("CPU device node has no sched-energy-costs\n"); return; } + /* Check if the energy model contains frequency/power values */ + if (of_find_property(cn, "freq-energy-model", NULL)) + freq_energy_model = true; for_each_possible_sd_level(sd_level) { cp = of_parse_phandle(cn, "sched-energy-costs", sd_level); @@ -104,7 +112,18 @@ void init_sched_energy_costs(void) GFP_NOWAIT); for (i = 0, val = prop->value; i < nstates; i++) { - cap_states[i].cap = be32_to_cpup(val++); + if (freq_energy_model) { + /* + * Capacity values will be calculated later using + * frequency reported by OPP driver and cpu_uarch_scale + * values. + */ + cap_states[i].frequency = be32_to_cpup(val++); + cap_states[i].cap = 0; + } else { + cap_states[i].frequency = 0; + cap_states[i].cap = be32_to_cpup(val++); + } cap_states[i].power = be32_to_cpup(val++); } @@ -130,13 +149,158 @@ void init_sched_energy_costs(void) sge_array[cpu][sd_level] = sge; } - - check_max_cap_vs_cpu_scale(cpu, sge_array[cpu][SD_LEVEL0]); + if (!freq_energy_model) + check_max_cap_vs_cpu_scale(cpu, sge_array[cpu][SD_LEVEL0]); } - + sge_ready = true; pr_info("Sched-energy-costs installed from DT\n"); return; out: free_resources(); } + +static int sched_energy_probe(struct platform_device *pdev) +{ + int cpu; + unsigned long *max_frequencies = NULL; + int ret; + + if (!sge_ready) + return -EPROBE_DEFER; + + if (!energy_aware() || !freq_energy_model) + return 0; + + max_frequencies = kmalloc_array(nr_cpu_ids, sizeof(unsigned long), + GFP_KERNEL); + if (!max_frequencies) { + ret = -ENOMEM; + goto exit; + } + + /* + * Find system max possible frequency and max frequencies for each + * CPUs. + */ + for_each_possible_cpu(cpu) { + struct device *cpu_dev; + struct dev_pm_opp *opp; + + cpu_dev = get_cpu_device(cpu); + if (IS_ERR_OR_NULL(cpu_dev)) { + if (!cpu_dev) + ret = -EINVAL; + else + ret = PTR_ERR(cpu_dev); + goto exit; + } + + max_frequencies[cpu] = ULONG_MAX; + + opp = dev_pm_opp_find_freq_floor(cpu_dev, + &max_frequencies[cpu]); + if (IS_ERR_OR_NULL(opp)) { + if (!opp || PTR_ERR(opp) == -ENODEV) + ret = -EPROBE_DEFER; + else + ret = PTR_ERR(opp); + goto exit; + } + + /* Convert HZ to KHZ */ + max_frequencies[cpu] /= 1000; + } + + /* update capacity in energy model */ + for_each_possible_cpu(cpu) { + unsigned long cpu_max_cap; + struct sched_group_energy *sge_l0, *sge; + cpu_max_cap = topology_get_cpu_scale(NULL, cpu); + + /* + * All the cap_states have same frequency table so use + * SD_LEVEL0's. + */ + sge_l0 = sge_array[cpu][SD_LEVEL0]; + if (sge_l0 && sge_l0->nr_cap_states > 0) { + int i; + int ncapstates = sge_l0->nr_cap_states; + + for (i = 0; i < ncapstates; i++) { + int sd_level; + unsigned long freq, cap; + + /* + * Energy model can contain more frequency + * steps than actual for multiple speedbin + * support. Ceil the max capacity with actual + * one. + */ + freq = min(sge_l0->cap_states[i].frequency, + max_frequencies[cpu]); + cap = DIV_ROUND_UP(cpu_max_cap * freq, + max_frequencies[cpu]); + + for_each_possible_sd_level(sd_level) { + sge = sge_array[cpu][sd_level]; + if (!sge) + break; + sge->cap_states[i].cap = cap; + } + dev_dbg(&pdev->dev, + "cpu=%d freq=%ld cap=%ld power_d0=%ld\n", + cpu, freq, sge_l0->cap_states[i].cap, + sge_l0->cap_states[i].power); + } + + dev_info(&pdev->dev, + "cpu=%d [freq=%ld cap=%ld power_d0=%ld] -> [freq=%ld cap=%ld power_d0=%ld]\n", + cpu, + sge_l0->cap_states[0].frequency, + sge_l0->cap_states[0].cap, + sge_l0->cap_states[0].power, + sge_l0->cap_states[ncapstates - 1].frequency, + sge_l0->cap_states[ncapstates - 1].cap, + sge_l0->cap_states[ncapstates - 1].power + ); + } + } + + kfree(max_frequencies); + dev_info(&pdev->dev, "Sched-energy-costs capacity updated\n"); + return 0; + +exit: + if (ret != -EPROBE_DEFER) + dev_err(&pdev->dev, "error=%d\n", ret); + kfree(max_frequencies); + return ret; +} + +static struct platform_driver energy_driver = { + .driver = { + .name = "sched-energy", + }, + .probe = sched_energy_probe, +}; + +static struct platform_device energy_device = { + .name = "sched-energy", +}; + +static int __init sched_energy_init(void) +{ + int ret; + + ret = platform_device_register(&energy_device); + if (ret) + pr_err("%s device_register failed:%d\n", __func__, ret); + ret = platform_driver_register(&energy_driver); + if (ret) { + pr_err("%s driver_register failed:%d\n", __func__, ret); + platform_device_unregister(&energy_device); + } + return ret; +} +subsys_initcall(sched_energy_init); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4816cd549cde..d10b39326d30 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5686,7 +5686,7 @@ unsigned long capacity_curr_of(int cpu) return cap_scale(max_cap, scale_freq); } -static inline bool energy_aware(void) +inline bool energy_aware(void) { return sched_feat(ENERGY_AWARE); } From 727eafb4a89d381f79063f0a988399fe717bc57b Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Mon, 2 Apr 2018 11:37:51 -0700 Subject: [PATCH 05/37] ANDROID: sched/fair: prevent possible infinite loop in sched_group_energy There is a race between hotplug and energy_diff which might result in endless loop in sched_group_energy. When this happens, the end condition cannot be detected. We can store how many CPUs we need to visit at the beginning, and bail out of the energy calculation if we visit more cpus than expected. Bug: 72311797 72202633 Change-Id: I01c5a5ee9b9712903cb321d3b5ccf2539205d55d Signed-off-by: Chris Redpath Signed-off-by: Puja Gupta --- kernel/sched/fair.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d10b39326d30..6d792b4cd645 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6236,11 +6236,21 @@ static int compute_energy(struct energy_env *eenv) int cpu; struct cpumask visit_cpus; struct sched_group *sg; + int cpu_count; WARN_ON(!eenv->sg_top->sge); cpumask_copy(&visit_cpus, sched_group_span(eenv->sg_top)); - + /* If a cpu is hotplugged in while we are in this function, it does + * not appear in the existing visit_cpus mask which came from the + * sched_group pointer of the sched_domain pointed at by sd_ea for + * either the prev or next cpu and was dereferenced in + * select_energy_cpu_idx. + * Since we will dereference sd_scs later as we iterate through the + * CPUs we expect to visit, new CPUs can be present which are not in + * the visit_cpus mask. Guard this with cpu_count. + */ + cpu_count = cpumask_weight(&visit_cpus); while (!cpumask_empty(&visit_cpus)) { struct sched_group *sg_shared_cap = NULL; @@ -6249,6 +6259,8 @@ static int compute_energy(struct energy_env *eenv) /* * Is the group utilization affected by cpus outside this * sched_group? + * This sd may have groups with cpus which were not present + * when we took visit_cpus. */ sd = rcu_dereference(per_cpu(sd_scs, cpu)); if (sd && sd->parent) @@ -6274,8 +6286,24 @@ static int compute_energy(struct energy_env *eenv) calc_sg_energy(eenv); /* remove CPUs we have just visited */ - if (!sd->child) + if (!sd->child) { + /* + * cpu_count here is the number of + * cpus we expect to visit in this + * calculation. If we race against + * hotplug, we can have extra cpus + * added to the groups we are + * iterating which do not appear in + * the visit_cpus mask. In that case + * we are not able to calculate energy + * without restarting so we will bail + * out and use prev_cpu this time. + */ + if (!cpu_count) + return -EINVAL; cpumask_xor(&visit_cpus, &visit_cpus, sched_group_span(sg)); + cpu_count--; + } if (cpumask_equal(sched_group_span(sg), sched_group_span(eenv->sg_top))) goto next_cpu; From 3f5daa54345bfdf55e38f3c8598bb69304891b33 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Thu, 29 Mar 2018 13:35:50 -0700 Subject: [PATCH 06/37] ANDROID: FIXUP: sched/fair: Fix hang during suspend in compute_energy BUG: 29353986 Change-Id: I0ad295fcb9134ee964cdd764e9038db3c25d91ba Signed-off-by: Andres Oportus Signed-off-by: Puja Gupta --- kernel/sched/fair.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6d792b4cd645..bf2b794dfc06 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6311,6 +6311,7 @@ static int compute_energy(struct energy_env *eenv) } while (sg = sg->next, sg != sd->groups); } next_cpu: + cpumask_clear_cpu(cpu, &visit_cpus); continue; } From 47112d6b55eb88eb292ca00943bd86493b303f1c Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 15 Nov 2017 17:33:49 -0800 Subject: [PATCH 07/37] UPSTREAM: zram: add zstd to the supported algorithms list Add ZSTD to the list of supported compression algorithms. ZRAM fio perf test: LZO DEFLATE ZSTD WRITE: (2180MB/s) (77.2MB/s) (1429MB/s) WRITE: (1617MB/s) (77.7MB/s) (1202MB/s) READ: (426MB/s) (595MB/s) (1181MB/s) READ: (422MB/s) (572MB/s) (1020MB/s) READ: (318MB/s) (67.8MB/s) (563MB/s) WRITE: (318MB/s) (67.9MB/s) (564MB/s) READ: (336MB/s) (68.3MB/s) (583MB/s) WRITE: (335MB/s) (68.2MB/s) (582MB/s) WRITE: (3441MB/s) (152MB/s) (2141MB/s) WRITE: (2507MB/s) (147MB/s) (1888MB/s) READ: (801MB/s) (1146MB/s) (1890MB/s) READ: (767MB/s) (1096MB/s) (2073MB/s) READ: (621MB/s) (126MB/s) (1009MB/s) WRITE: (621MB/s) (126MB/s) (1009MB/s) READ: (656MB/s) (125MB/s) (1075MB/s) WRITE: (657MB/s) (126MB/s) (1077MB/s) WRITE: (4772MB/s) (225MB/s) (3394MB/s) WRITE: (3905MB/s) (211MB/s) (2939MB/s) READ: (1216MB/s) (1608MB/s) (3218MB/s) READ: (1159MB/s) (1431MB/s) (2981MB/s) READ: (906MB/s) (156MB/s) (1457MB/s) WRITE: (907MB/s) (156MB/s) (1458MB/s) READ: (953MB/s) (158MB/s) (1595MB/s) WRITE: (952MB/s) (157MB/s) (1593MB/s) WRITE: (6036MB/s) (265MB/s) (4469MB/s) WRITE: (5059MB/s) (263MB/s) (3951MB/s) READ: (1618MB/s) (2066MB/s) (4276MB/s) READ: (1573MB/s) (1942MB/s) (3830MB/s) READ: (1202MB/s) (227MB/s) (1971MB/s) WRITE: (1200MB/s) (227MB/s) (1968MB/s) READ: (1265MB/s) (226MB/s) (2116MB/s) WRITE: (1264MB/s) (226MB/s) (2114MB/s) WRITE: (5339MB/s) (233MB/s) (3781MB/s) WRITE: (4298MB/s) (234MB/s) (3276MB/s) READ: (1626MB/s) (2048MB/s) (4081MB/s) READ: (1567MB/s) (1929MB/s) (3758MB/s) READ: (1174MB/s) (205MB/s) (1747MB/s) WRITE: (1173MB/s) (204MB/s) (1746MB/s) READ: (1214MB/s) (208MB/s) (1890MB/s) WRITE: (1215MB/s) (208MB/s) (1892MB/s) WRITE: (5666MB/s) (270MB/s) (4338MB/s) WRITE: (4828MB/s) (267MB/s) (3772MB/s) READ: (1803MB/s) (2058MB/s) (4946MB/s) READ: (1805MB/s) (2156MB/s) (4711MB/s) READ: (1334MB/s) (235MB/s) (2135MB/s) WRITE: (1335MB/s) (235MB/s) (2137MB/s) READ: (1364MB/s) (236MB/s) (2268MB/s) WRITE: (1365MB/s) (237MB/s) (2270MB/s) WRITE: (5474MB/s) (270MB/s) (4300MB/s) WRITE: (4666MB/s) (266MB/s) (3817MB/s) READ: (2022MB/s) (2319MB/s) (5472MB/s) READ: (1924MB/s) (2260MB/s) (5031MB/s) READ: (1369MB/s) (242MB/s) (2153MB/s) WRITE: (1370MB/s) (242MB/s) (2155MB/s) READ: (1499MB/s) (246MB/s) (2310MB/s) WRITE: (1497MB/s) (246MB/s) (2307MB/s) WRITE: (5558MB/s) (273MB/s) (4439MB/s) WRITE: (4763MB/s) (271MB/s) (3918MB/s) READ: (2201MB/s) (2599MB/s) (6062MB/s) READ: (2105MB/s) (2463MB/s) (5413MB/s) READ: (1490MB/s) (252MB/s) (2238MB/s) WRITE: (1488MB/s) (252MB/s) (2236MB/s) READ: (1566MB/s) (254MB/s) (2434MB/s) WRITE: (1568MB/s) (254MB/s) (2437MB/s) WRITE: (5120MB/s) (264MB/s) (4035MB/s) WRITE: (4531MB/s) (267MB/s) (3740MB/s) READ: (1940MB/s) (2258MB/s) (4986MB/s) READ: (2024MB/s) (2387MB/s) (4871MB/s) READ: (1343MB/s) (246MB/s) (2038MB/s) WRITE: (1342MB/s) (246MB/s) (2037MB/s) READ: (1553MB/s) (238MB/s) (2243MB/s) WRITE: (1552MB/s) (238MB/s) (2242MB/s) WRITE: (5345MB/s) (271MB/s) (3988MB/s) WRITE: (4750MB/s) (254MB/s) (3668MB/s) READ: (1876MB/s) (2363MB/s) (5150MB/s) READ: (1990MB/s) (2256MB/s) (5080MB/s) READ: (1355MB/s) (250MB/s) (2019MB/s) WRITE: (1356MB/s) (251MB/s) (2020MB/s) READ: (1490MB/s) (252MB/s) (2202MB/s) WRITE: (1488MB/s) (252MB/s) (2199MB/s) jobs1 perfstat instructions 52,065,555,710 ( 0.79) 855,731,114,587 ( 2.64) 54,280,709,944 ( 1.40) branches 14,020,427,116 ( 725.847) 101,733,449,582 (1074.521) 11,170,591,067 ( 992.869) branch-misses 22,626,174 ( 0.16%) 274,197,885 ( 0.27%) 25,915,805 ( 0.23%) jobs2 perfstat instructions 103,633,110,402 ( 0.75) 1,710,822,100,914 ( 2.59) 107,879,874,104 ( 1.28) branches 27,931,237,282 ( 679.203) 203,298,267,479 (1037.326) 22,185,350,842 ( 884.427) branch-misses 46,103,811 ( 0.17%) 533,747,204 ( 0.26%) 49,682,483 ( 0.22%) jobs3 perfstat instructions 154,857,283,657 ( 0.76) 2,565,748,974,197 ( 2.57) 161,515,435,813 ( 1.31) branches 41,759,490,355 ( 670.529) 304,905,605,277 ( 978.765) 33,215,805,907 ( 888.003) branch-misses 74,263,293 ( 0.18%) 759,746,240 ( 0.25%) 76,841,196 ( 0.23%) jobs4 perfstat instructions 206,215,849,076 ( 0.75) 3,420,169,460,897 ( 2.60) 215,003,061,664 ( 1.31) branches 55,632,141,739 ( 666.501) 406,394,977,433 ( 927.241) 44,214,322,251 ( 883.532) branch-misses 102,287,788 ( 0.18%) 1,098,617,314 ( 0.27%) 103,891,040 ( 0.23%) jobs5 perfstat instructions 258,711,315,588 ( 0.67) 4,275,657,533,244 ( 2.23) 269,332,235,685 ( 1.08) branches 69,802,821,166 ( 588.823) 507,996,211,252 ( 797.036) 55,450,846,129 ( 735.095) branch-misses 129,217,214 ( 0.19%) 1,243,284,991 ( 0.24%) 173,512,278 ( 0.31%) jobs6 perfstat instructions 312,796,166,008 ( 0.61) 5,133,896,344,660 ( 2.02) 323,658,769,588 ( 1.04) branches 84,372,488,583 ( 520.541) 610,310,494,402 ( 697.642) 66,683,292,992 ( 693.939) branch-misses 159,438,978 ( 0.19%) 1,396,368,563 ( 0.23%) 174,406,934 ( 0.26%) jobs7 perfstat instructions 363,211,372,930 ( 0.56) 5,988,205,600,879 ( 1.75) 377,824,674,156 ( 0.93) branches 98,057,013,765 ( 463.117) 711,841,255,974 ( 598.762) 77,879,009,954 ( 600.443) branch-misses 199,513,153 ( 0.20%) 1,507,651,077 ( 0.21%) 248,203,369 ( 0.32%) jobs8 perfstat instructions 413,960,354,615 ( 0.52) 6,842,918,558,378 ( 1.45) 431,938,486,581 ( 0.83) branches 111,812,574,884 ( 414.224) 813,299,084,518 ( 491.173) 89,062,699,827 ( 517.795) branch-misses 233,584,845 ( 0.21%) 1,531,593,921 ( 0.19%) 286,818,489 ( 0.32%) jobs9 perfstat instructions 465,976,220,300 ( 0.53) 7,698,467,237,372 ( 1.47) 486,352,600,321 ( 0.84) branches 125,931,456,162 ( 424.063) 915,207,005,715 ( 498.192) 100,370,404,090 ( 517.439) branch-misses 256,992,445 ( 0.20%) 1,782,809,816 ( 0.19%) 345,239,380 ( 0.34%) jobs10 perfstat instructions 517,406,372,715 ( 0.53) 8,553,527,312,900 ( 1.48) 540,732,653,094 ( 0.84) branches 139,839,780,676 ( 427.732) 1,016,737,699,389 ( 503.172) 111,696,557,638 ( 516.750) branch-misses 259,595,561 ( 0.19%) 1,952,570,279 ( 0.19%) 357,818,661 ( 0.32%) seconds elapsed 20.630411534 96.084546565 12.743373571 seconds elapsed 22.292627625 100.984155001 14.407413560 seconds elapsed 22.396016966 110.344880848 14.032201392 seconds elapsed 22.517330949 113.351459170 14.243074935 seconds elapsed 28.548305104 156.515193765 19.159286861 seconds elapsed 30.453538116 164.559937678 19.362492717 seconds elapsed 33.467108086 188.486827481 21.492612173 seconds elapsed 35.617727591 209.602677783 23.256422492 seconds elapsed 42.584239509 243.959902566 28.458540338 seconds elapsed 47.683632526 269.635248851 31.542404137 Over all, ZSTD has slower WRITE, but much faster READ (perhaps a static compression buffer used during the test helped ZSTD a lot), which results in faster test results. Memory consumption (zram mm_stat file): zram LZO mm_stat mm_stat (jobs1): 2147483648 23068672 33558528 0 33558528 0 0 mm_stat (jobs2): 2147483648 23068672 33558528 0 33558528 0 0 mm_stat (jobs3): 2147483648 23068672 33558528 0 33562624 0 0 mm_stat (jobs4): 2147483648 23068672 33558528 0 33558528 0 0 mm_stat (jobs5): 2147483648 23068672 33558528 0 33558528 0 0 mm_stat (jobs6): 2147483648 23068672 33558528 0 33562624 0 0 mm_stat (jobs7): 2147483648 23068672 33558528 0 33566720 0 0 mm_stat (jobs8): 2147483648 23068672 33558528 0 33558528 0 0 mm_stat (jobs9): 2147483648 23068672 33558528 0 33558528 0 0 mm_stat (jobs10): 2147483648 23068672 33558528 0 33562624 0 0 zram DEFLATE mm_stat mm_stat (jobs1): 2147483648 16252928 25178112 0 25178112 0 0 mm_stat (jobs2): 2147483648 16252928 25178112 0 25178112 0 0 mm_stat (jobs3): 2147483648 16252928 25178112 0 25178112 0 0 mm_stat (jobs4): 2147483648 16252928 25178112 0 25178112 0 0 mm_stat (jobs5): 2147483648 16252928 25178112 0 25178112 0 0 mm_stat (jobs6): 2147483648 16252928 25178112 0 25178112 0 0 mm_stat (jobs7): 2147483648 16252928 25178112 0 25190400 0 0 mm_stat (jobs8): 2147483648 16252928 25178112 0 25190400 0 0 mm_stat (jobs9): 2147483648 16252928 25178112 0 25178112 0 0 mm_stat (jobs10): 2147483648 16252928 25178112 0 25178112 0 0 zram ZSTD mm_stat mm_stat (jobs1): 2147483648 11010048 16781312 0 16781312 0 0 mm_stat (jobs2): 2147483648 11010048 16781312 0 16781312 0 0 mm_stat (jobs3): 2147483648 11010048 16781312 0 16785408 0 0 mm_stat (jobs4): 2147483648 11010048 16781312 0 16781312 0 0 mm_stat (jobs5): 2147483648 11010048 16781312 0 16781312 0 0 mm_stat (jobs6): 2147483648 11010048 16781312 0 16781312 0 0 mm_stat (jobs7): 2147483648 11010048 16781312 0 16781312 0 0 mm_stat (jobs8): 2147483648 11010048 16781312 0 16781312 0 0 mm_stat (jobs9): 2147483648 11010048 16781312 0 16785408 0 0 mm_stat (jobs10): 2147483648 11010048 16781312 0 16781312 0 0 ================================================================================== Official benchmarks [1]: Compressor name Ratio Compression Decompress. zstd 1.1.3 -1 2.877 430 MB/s 1110 MB/s zlib 1.2.8 -1 2.743 110 MB/s 400 MB/s brotli 0.5.2 -0 2.708 400 MB/s 430 MB/s quicklz 1.5.0 -1 2.238 550 MB/s 710 MB/s lzo1x 2.09 -1 2.108 650 MB/s 830 MB/s lz4 1.7.5 2.101 720 MB/s 3600 MB/s snappy 1.1.3 2.091 500 MB/s 1650 MB/s lzf 3.6 -1 2.077 400 MB/s 860 MB/s Minchan said: : I did test with my sample data and compared zstd with deflate. zstd's : compress ratio is lower a little bit but compression speed is much faster : 3 times more and decompress speed is too 2 times more. With different : data, it is different but overall, zstd would be better for speed at the : cost of a little lower compress ratio(about 5%) so I believe it's worth to : replace deflate. [1] https://github.com/facebook/zstd Link: http://lkml.kernel.org/r/20170912050005.3247-1-sergey.senozhatsky@gmail.com Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Tested-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 5ef3a8b12556d7fcba81edc74e9d85b029615ae0) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: Ie25565285924304b17b55f8875f9f6f178b6aa02 --- drivers/block/zram/zcomp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 5b8992beffec..cc66daec7bbc 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -31,6 +31,9 @@ static const char * const backends[] = { #endif #if IS_ENABLED(CONFIG_CRYPTO_842) "842", +#endif +#if IS_ENABLED(CONFIG_CRYPTO_ZSTD) + "zstd", #endif NULL }; From 6576d939b169c917039282cdeead56139ddd030e Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Fri, 30 Mar 2018 12:14:53 -0700 Subject: [PATCH 08/37] UPSTREAM: crypto: zstd - Add zstd support Adds zstd support to crypto and scompress. Only supports the default level. Previously we held off on this patch, since there weren't any users. Now zram is ready for zstd support, but depends on CONFIG_CRYPTO_ZSTD, which isn't defined until this patch is in. I also see a patch adding zstd to pstore [0], which depends on crypto zstd. [0] lkml.kernel.org/r/9c9416b2dff19f05fb4c35879aaa83d11ff72c92.1521626182.git.geliangtang@gmail.com Signed-off-by: Nick Terrell Signed-off-by: Herbert Xu (cherry picked from commit d28fc3dbe1918333730d62aa5f0d84b6fb4e7254) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: Iaa0afe93fe645b6cf2b8d9e29144096bee8b2c67 --- crypto/Kconfig | 9 ++ crypto/Makefile | 1 + crypto/testmgr.c | 10 ++ crypto/testmgr.h | 71 +++++++++++++ crypto/zstd.c | 265 +++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 356 insertions(+) create mode 100644 crypto/zstd.c diff --git a/crypto/Kconfig b/crypto/Kconfig index 8239f27299f0..f7bc7e373a46 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1651,6 +1651,15 @@ config CRYPTO_LZ4HC help This is the LZ4 high compression mode algorithm. +config CRYPTO_ZSTD + tristate "Zstd compression algorithm" + select CRYPTO_ALGAPI + select CRYPTO_ACOMP2 + select ZSTD_COMPRESS + select ZSTD_DECOMPRESS + help + This is the zstd algorithm. + comment "Random Number Generation" config CRYPTO_ANSI_CPRNG diff --git a/crypto/Makefile b/crypto/Makefile index 2ae78fb2a390..632d4e7b3a7d 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -136,6 +136,7 @@ obj-$(CONFIG_CRYPTO_USER_API_HASH) += algif_hash.o obj-$(CONFIG_CRYPTO_USER_API_SKCIPHER) += algif_skcipher.o obj-$(CONFIG_CRYPTO_USER_API_RNG) += algif_rng.o obj-$(CONFIG_CRYPTO_USER_API_AEAD) += algif_aead.o +obj-$(CONFIG_CRYPTO_ZSTD) += zstd.o ecdh_generic-y := ecc.o ecdh_generic-y += ecdh.o diff --git a/crypto/testmgr.c b/crypto/testmgr.c index b5bb45a89ff8..ec5d0035d819 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -3639,6 +3639,16 @@ static const struct alg_test_desc alg_test_descs[] = { .decomp = __VECS(zlib_deflate_decomp_tv_template) } } + }, { + .alg = "zstd", + .test = alg_test_comp, + .fips_allowed = 1, + .suite = { + .comp = { + .comp = __VECS(zstd_comp_tv_template), + .decomp = __VECS(zstd_decomp_tv_template) + } + } } }; diff --git a/crypto/testmgr.h b/crypto/testmgr.h index b0de033a5299..5d634ece1c07 100644 --- a/crypto/testmgr.h +++ b/crypto/testmgr.h @@ -36124,4 +36124,75 @@ static const struct comp_testvec lz4hc_decomp_tv_template[] = { }, }; +static const struct comp_testvec zstd_comp_tv_template[] = { + { + .inlen = 68, + .outlen = 39, + .input = "The algorithm is zstd. " + "The algorithm is zstd. " + "The algorithm is zstd.", + .output = "\x28\xb5\x2f\xfd\x00\x50\xf5\x00\x00\xb8\x54\x68\x65" + "\x20\x61\x6c\x67\x6f\x72\x69\x74\x68\x6d\x20\x69\x73" + "\x20\x7a\x73\x74\x64\x2e\x20\x01\x00\x55\x73\x36\x01" + , + }, + { + .inlen = 244, + .outlen = 151, + .input = "zstd, short for Zstandard, is a fast lossless " + "compression algorithm, targeting real-time " + "compression scenarios at zlib-level and better " + "compression ratios. The zstd compression library " + "provides in-memory compression and decompression " + "functions.", + .output = "\x28\xb5\x2f\xfd\x00\x50\x75\x04\x00\x42\x4b\x1e\x17" + "\x90\x81\x31\x00\xf2\x2f\xe4\x36\xc9\xef\x92\x88\x32" + "\xc9\xf2\x24\x94\xd8\x68\x9a\x0f\x00\x0c\xc4\x31\x6f" + "\x0d\x0c\x38\xac\x5c\x48\x03\xcd\x63\x67\xc0\xf3\xad" + "\x4e\x90\xaa\x78\xa0\xa4\xc5\x99\xda\x2f\xb6\x24\x60" + "\xe2\x79\x4b\xaa\xb6\x6b\x85\x0b\xc9\xc6\x04\x66\x86" + "\xe2\xcc\xe2\x25\x3f\x4f\x09\xcd\xb8\x9d\xdb\xc1\x90" + "\xa9\x11\xbc\x35\x44\x69\x2d\x9c\x64\x4f\x13\x31\x64" + "\xcc\xfb\x4d\x95\x93\x86\x7f\x33\x7f\x1a\xef\xe9\x30" + "\xf9\x67\xa1\x94\x0a\x69\x0f\x60\xcd\xc3\xab\x99\xdc" + "\x42\xed\x97\x05\x00\x33\xc3\x15\x95\x3a\x06\xa0\x0e" + "\x20\xa9\x0e\x82\xb9\x43\x45\x01", + }, +}; + +static const struct comp_testvec zstd_decomp_tv_template[] = { + { + .inlen = 43, + .outlen = 68, + .input = "\x28\xb5\x2f\xfd\x04\x50\xf5\x00\x00\xb8\x54\x68\x65" + "\x20\x61\x6c\x67\x6f\x72\x69\x74\x68\x6d\x20\x69\x73" + "\x20\x7a\x73\x74\x64\x2e\x20\x01\x00\x55\x73\x36\x01" + "\x6b\xf4\x13\x35", + .output = "The algorithm is zstd. " + "The algorithm is zstd. " + "The algorithm is zstd.", + }, + { + .inlen = 155, + .outlen = 244, + .input = "\x28\xb5\x2f\xfd\x04\x50\x75\x04\x00\x42\x4b\x1e\x17" + "\x90\x81\x31\x00\xf2\x2f\xe4\x36\xc9\xef\x92\x88\x32" + "\xc9\xf2\x24\x94\xd8\x68\x9a\x0f\x00\x0c\xc4\x31\x6f" + "\x0d\x0c\x38\xac\x5c\x48\x03\xcd\x63\x67\xc0\xf3\xad" + "\x4e\x90\xaa\x78\xa0\xa4\xc5\x99\xda\x2f\xb6\x24\x60" + "\xe2\x79\x4b\xaa\xb6\x6b\x85\x0b\xc9\xc6\x04\x66\x86" + "\xe2\xcc\xe2\x25\x3f\x4f\x09\xcd\xb8\x9d\xdb\xc1\x90" + "\xa9\x11\xbc\x35\x44\x69\x2d\x9c\x64\x4f\x13\x31\x64" + "\xcc\xfb\x4d\x95\x93\x86\x7f\x33\x7f\x1a\xef\xe9\x30" + "\xf9\x67\xa1\x94\x0a\x69\x0f\x60\xcd\xc3\xab\x99\xdc" + "\x42\xed\x97\x05\x00\x33\xc3\x15\x95\x3a\x06\xa0\x0e" + "\x20\xa9\x0e\x82\xb9\x43\x45\x01\xaa\x6d\xda\x0d", + .output = "zstd, short for Zstandard, is a fast lossless " + "compression algorithm, targeting real-time " + "compression scenarios at zlib-level and better " + "compression ratios. The zstd compression library " + "provides in-memory compression and decompression " + "functions.", + }, +}; #endif /* _CRYPTO_TESTMGR_H */ diff --git a/crypto/zstd.c b/crypto/zstd.c new file mode 100644 index 000000000000..9a76b3ed8b8b --- /dev/null +++ b/crypto/zstd.c @@ -0,0 +1,265 @@ +/* + * Cryptographic API. + * + * Copyright (c) 2017-present, Facebook, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define ZSTD_DEF_LEVEL 3 + +struct zstd_ctx { + ZSTD_CCtx *cctx; + ZSTD_DCtx *dctx; + void *cwksp; + void *dwksp; +}; + +static ZSTD_parameters zstd_params(void) +{ + return ZSTD_getParams(ZSTD_DEF_LEVEL, 0, 0); +} + +static int zstd_comp_init(struct zstd_ctx *ctx) +{ + int ret = 0; + const ZSTD_parameters params = zstd_params(); + const size_t wksp_size = ZSTD_CCtxWorkspaceBound(params.cParams); + + ctx->cwksp = vzalloc(wksp_size); + if (!ctx->cwksp) { + ret = -ENOMEM; + goto out; + } + + ctx->cctx = ZSTD_initCCtx(ctx->cwksp, wksp_size); + if (!ctx->cctx) { + ret = -EINVAL; + goto out_free; + } +out: + return ret; +out_free: + vfree(ctx->cwksp); + goto out; +} + +static int zstd_decomp_init(struct zstd_ctx *ctx) +{ + int ret = 0; + const size_t wksp_size = ZSTD_DCtxWorkspaceBound(); + + ctx->dwksp = vzalloc(wksp_size); + if (!ctx->dwksp) { + ret = -ENOMEM; + goto out; + } + + ctx->dctx = ZSTD_initDCtx(ctx->dwksp, wksp_size); + if (!ctx->dctx) { + ret = -EINVAL; + goto out_free; + } +out: + return ret; +out_free: + vfree(ctx->dwksp); + goto out; +} + +static void zstd_comp_exit(struct zstd_ctx *ctx) +{ + vfree(ctx->cwksp); + ctx->cwksp = NULL; + ctx->cctx = NULL; +} + +static void zstd_decomp_exit(struct zstd_ctx *ctx) +{ + vfree(ctx->dwksp); + ctx->dwksp = NULL; + ctx->dctx = NULL; +} + +static int __zstd_init(void *ctx) +{ + int ret; + + ret = zstd_comp_init(ctx); + if (ret) + return ret; + ret = zstd_decomp_init(ctx); + if (ret) + zstd_comp_exit(ctx); + return ret; +} + +static void *zstd_alloc_ctx(struct crypto_scomp *tfm) +{ + int ret; + struct zstd_ctx *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return ERR_PTR(-ENOMEM); + + ret = __zstd_init(ctx); + if (ret) { + kfree(ctx); + return ERR_PTR(ret); + } + + return ctx; +} + +static int zstd_init(struct crypto_tfm *tfm) +{ + struct zstd_ctx *ctx = crypto_tfm_ctx(tfm); + + return __zstd_init(ctx); +} + +static void __zstd_exit(void *ctx) +{ + zstd_comp_exit(ctx); + zstd_decomp_exit(ctx); +} + +static void zstd_free_ctx(struct crypto_scomp *tfm, void *ctx) +{ + __zstd_exit(ctx); + kzfree(ctx); +} + +static void zstd_exit(struct crypto_tfm *tfm) +{ + struct zstd_ctx *ctx = crypto_tfm_ctx(tfm); + + __zstd_exit(ctx); +} + +static int __zstd_compress(const u8 *src, unsigned int slen, + u8 *dst, unsigned int *dlen, void *ctx) +{ + size_t out_len; + struct zstd_ctx *zctx = ctx; + const ZSTD_parameters params = zstd_params(); + + out_len = ZSTD_compressCCtx(zctx->cctx, dst, *dlen, src, slen, params); + if (ZSTD_isError(out_len)) + return -EINVAL; + *dlen = out_len; + return 0; +} + +static int zstd_compress(struct crypto_tfm *tfm, const u8 *src, + unsigned int slen, u8 *dst, unsigned int *dlen) +{ + struct zstd_ctx *ctx = crypto_tfm_ctx(tfm); + + return __zstd_compress(src, slen, dst, dlen, ctx); +} + +static int zstd_scompress(struct crypto_scomp *tfm, const u8 *src, + unsigned int slen, u8 *dst, unsigned int *dlen, + void *ctx) +{ + return __zstd_compress(src, slen, dst, dlen, ctx); +} + +static int __zstd_decompress(const u8 *src, unsigned int slen, + u8 *dst, unsigned int *dlen, void *ctx) +{ + size_t out_len; + struct zstd_ctx *zctx = ctx; + + out_len = ZSTD_decompressDCtx(zctx->dctx, dst, *dlen, src, slen); + if (ZSTD_isError(out_len)) + return -EINVAL; + *dlen = out_len; + return 0; +} + +static int zstd_decompress(struct crypto_tfm *tfm, const u8 *src, + unsigned int slen, u8 *dst, unsigned int *dlen) +{ + struct zstd_ctx *ctx = crypto_tfm_ctx(tfm); + + return __zstd_decompress(src, slen, dst, dlen, ctx); +} + +static int zstd_sdecompress(struct crypto_scomp *tfm, const u8 *src, + unsigned int slen, u8 *dst, unsigned int *dlen, + void *ctx) +{ + return __zstd_decompress(src, slen, dst, dlen, ctx); +} + +static struct crypto_alg alg = { + .cra_name = "zstd", + .cra_flags = CRYPTO_ALG_TYPE_COMPRESS, + .cra_ctxsize = sizeof(struct zstd_ctx), + .cra_module = THIS_MODULE, + .cra_init = zstd_init, + .cra_exit = zstd_exit, + .cra_u = { .compress = { + .coa_compress = zstd_compress, + .coa_decompress = zstd_decompress } } +}; + +static struct scomp_alg scomp = { + .alloc_ctx = zstd_alloc_ctx, + .free_ctx = zstd_free_ctx, + .compress = zstd_scompress, + .decompress = zstd_sdecompress, + .base = { + .cra_name = "zstd", + .cra_driver_name = "zstd-scomp", + .cra_module = THIS_MODULE, + } +}; + +static int __init zstd_mod_init(void) +{ + int ret; + + ret = crypto_register_alg(&alg); + if (ret) + return ret; + + ret = crypto_register_scomp(&scomp); + if (ret) + crypto_unregister_alg(&alg); + + return ret; +} + +static void __exit zstd_mod_fini(void) +{ + crypto_unregister_alg(&alg); + crypto_unregister_scomp(&scomp); +} + +module_init(zstd_mod_init); +module_exit(zstd_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Zstd Compression Algorithm"); +MODULE_ALIAS_CRYPTO("zstd"); From 9c9d1fe0a25146debe668b7f4a2629d16c4ce537 Mon Sep 17 00:00:00 2001 From: Peter Kalauskas Date: Thu, 23 Aug 2018 16:26:51 -0700 Subject: [PATCH 09/37] ANDROID: x86_64_cuttlefish_defconfig: Enable zram and zstd Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: Ib4c5d7d9145c0258a956ce0af70d7924f3f2fd96 --- arch/x86/configs/x86_64_cuttlefish_defconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/configs/x86_64_cuttlefish_defconfig b/arch/x86/configs/x86_64_cuttlefish_defconfig index 19e3a812306b..761739a11425 100644 --- a/arch/x86/configs/x86_64_cuttlefish_defconfig +++ b/arch/x86/configs/x86_64_cuttlefish_defconfig @@ -52,6 +52,7 @@ CONFIG_X86_CPUID=y CONFIG_KSM=y CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 CONFIG_TRANSPARENT_HUGEPAGE=y +CONFIG_ZSMALLOC=y # CONFIG_MTRR is not set CONFIG_HZ_100=y CONFIG_KEXEC=y @@ -198,6 +199,7 @@ CONFIG_DEBUG_DEVRES=y CONFIG_OF=y CONFIG_OF_UNITTEST=y # CONFIG_PNP_DEBUG_MESSAGES is not set +CONFIG_ZRAM=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=8192 @@ -452,6 +454,7 @@ CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1 CONFIG_CRYPTO_RSA=y # CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set CONFIG_CRYPTO_SHA512=y +CONFIG_CRYPTO_ZSTD=y CONFIG_CRYPTO_DEV_VIRTIO=y CONFIG_ASYMMETRIC_KEY_TYPE=y CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y From 7b4e1b649ca74dc118c8d355719237f4dd886fbc Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 15 Nov 2017 17:32:56 -0800 Subject: [PATCH 10/37] UPSTREAM: zram: set BDI_CAP_STABLE_WRITES once With fast swap storage, the platform wants to use swap more aggressively and swap-in is crucial to application latency. The rw_page() based synchronous devices like zram, pmem and btt are such fast storage. When I profile swapin performance with zram lz4 decompress test, S/W overhead is more than 70%. Maybe, it would be bigger in nvdimm. This patchset reduces swap-in latency by skipping swapcache if the swap device is a synchronous device like a rw_page() based device. It enhances by 45% my swapin test (5G sequential swapin, no readahead) from 2.41sec to 1.64sec. This patch (of 4): Commit 19b7ccf8651d ("block: get rid of blk_integrity_revalidate()") fixed a weird thing (i.e., reset BDI_CAP_STABLE_WRITES flag unconditionally whenever revalidat_disk is called) so zram doesn't need to reset the flag any more when revalidating the bdev. Instead, set the flag just once when the zram device is created. It shouldn't change any behavior. Link: http://lkml.kernel.org/r/1505886205-9671-2-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Cc: Ilya Dryomov Cc: Christoph Hellwig Cc: Dan Williams Cc: Ross Zwisler Cc: Jens Axboe Cc: Hugh Dickins Cc: Huang Ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit e447a0151f7ce8dd884fea48279274bd64434c29) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I6f1c4eede69eca5fc0af11cfe73c26707d7e4396 --- drivers/block/zram/zram_drv.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index f149d3e61234..d95bb8ce5092 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -122,14 +122,6 @@ static inline bool is_partial_io(struct bio_vec *bvec) } #endif -static void zram_revalidate_disk(struct zram *zram) -{ - revalidate_disk(zram->disk); - /* revalidate_disk reset the BDI_CAP_STABLE_WRITES so set again */ - zram->disk->queue->backing_dev_info->capabilities |= - BDI_CAP_STABLE_WRITES; -} - /* * Check if request is within bounds and aligned on zram logical blocks. */ @@ -1373,7 +1365,8 @@ static ssize_t disksize_store(struct device *dev, zram->comp = comp; zram->disksize = disksize; set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); - zram_revalidate_disk(zram); + + revalidate_disk(zram->disk); up_write(&zram->init_lock); return len; @@ -1420,7 +1413,7 @@ static ssize_t reset_store(struct device *dev, /* Make sure all the pending I/O are finished */ fsync_bdev(bdev); zram_reset_device(zram); - zram_revalidate_disk(zram); + revalidate_disk(zram->disk); bdput(bdev); mutex_lock(&bdev->bd_mutex); @@ -1539,6 +1532,7 @@ static int zram_add(void) /* zram devices sort of resembles non-rotational disks */ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue); + /* * To ensure that we always get PAGE_SIZE aligned * and n*PAGE_SIZED sized I/O requests. @@ -1563,6 +1557,8 @@ static int zram_add(void) if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE) blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX); + zram->disk->queue->backing_dev_info->capabilities |= + BDI_CAP_STABLE_WRITES; add_disk(zram->disk); ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj, From 8ba0238a1911cd7d65e2c693edc9da317425f4a7 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 15 Nov 2017 17:37:08 -0800 Subject: [PATCH 11/37] UPSTREAM: drivers/block/zram/zram_drv.c: make zram_page_end_io() static zram_page_end_io() is local to the source and does not need to be in global scope, so make it static. Cleans up sparse warning: symbol 'zram_page_end_io' was not declared. Should it be static? Link: http://lkml.kernel.org/r/20171016173336.20320-1-colin.king@canonical.com Signed-off-by: Colin Ian King Reviewed-by: Sergey Senozhatsky Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 384bc41fc064bd8b12b7081aa3e81d26f3407045) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: Ic9a9021e9c89759276a1192a4ee0f1a9b6a09a86 --- drivers/block/zram/zram_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index d95bb8ce5092..36117f649e53 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -428,7 +428,7 @@ static void put_entry_bdev(struct zram *zram, unsigned long entry) WARN_ON_ONCE(!was_set); } -void zram_page_end_io(struct bio *bio) +static void zram_page_end_io(struct bio *bio) { struct page *page = bio->bi_io_vec[0].bv_page; From 4d2ef1fc3b1861e02aa28782a97f18b9fffa4710 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 28 Feb 2018 10:15:30 -0800 Subject: [PATCH 12/37] UPSTREAM: zram: Delete gendisk before cleaning up the request queue Remove the disk, partition and bdi sysfs attributes before cleaning up the request queue associated with the disk. Signed-off-by: Bart Van Assche Reviewed-by: Johannes Thumshirn Reviewed-by: Joseph Qi Reviewed-by: Ming Lei Cc: Minchan Kim Cc: Nitin Gupta Cc: Sergey Senozhatsky Signed-off-by: Jens Axboe (cherry picked from commit 392db38058eb47250a9d0cc737af37e78a7e443d) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I65e8f5e6241bc62ecde17040aae86684e039c737 --- drivers/block/zram/zram_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 36117f649e53..472f314dd599 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1620,8 +1620,8 @@ static int zram_remove(struct zram *zram) pr_info("Removed device: %s\n", zram->disk->disk_name); - blk_cleanup_queue(zram->disk->queue); del_gendisk(zram->disk); + blk_cleanup_queue(zram->disk->queue); put_disk(zram->disk); kfree(zram); return 0; From 6e0e675f20368be37011b6f114e627c48bfbe81a Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 7 Jun 2018 17:05:39 -0700 Subject: [PATCH 13/37] UPSTREAM: zram: correct flag name of ZRAM_ACCESS Patch series "zram memory tracking", v5. zRam as swap is useful for small memory device. However, swap means those pages on zram are mostly cold pages due to VM's LRU algorithm. Especially, once init data for application are touched for launching, they tend to be not accessed any more and finally swapped out. zRAM can store such cold pages as compressed form but it's pointless to keep in memory. As well, it's pointless to store incompressible pages to zram so better idea is app developers manages them directly like free or mlock rather than remaining them on heap. This patch provides a debugfs /sys/kernel/debug/zram/zram0/block_state to represent each block's state so admin can investigate what memory is cold|incompressible|same page with using pagemap once the pages are swapped out. The output is as follows: 300 75.033841 .wh 301 63.806904 s.. 302 63.806919 ..h First column is zram's block index and 3rh one represents symbol (s: same page w: written page to backing store h: huge page) of the block state. Second column represents usec time unit of the block was last accessed. So above example means the 300th block is accessed at 75.033851 second and it was huge so it was written to the backing store. This patch (of 4): ZRAM_ACCESS is used for locking a slot of zram so correct the name. It is also not a common flag to indicate status of the block so move the declare position on top of the flag. Lastly, let's move the function to the top of source code to be able to use it easily without forward declaration. Link: http://lkml.kernel.org/r/20180416090946.63057-2-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit c4d6c4cc7bfd5ecc18548420b7fb9440cf8416ae) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I90a1e061e0ab506bd6759a40460cf4f2dd594d29 --- drivers/block/zram/zram_drv.c | 20 ++++++++++---------- drivers/block/zram/zram_drv.h | 6 +++--- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 472f314dd599..4c14ce409fca 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -47,6 +47,16 @@ static unsigned int num_devices = 1; static void zram_free_page(struct zram *zram, size_t index); +static void zram_slot_lock(struct zram *zram, u32 index) +{ + bit_spin_lock(ZRAM_LOCK, &zram->table[index].value); +} + +static void zram_slot_unlock(struct zram *zram, u32 index) +{ + bit_spin_unlock(ZRAM_LOCK, &zram->table[index].value); +} + static inline bool init_done(struct zram *zram) { return zram->disksize; @@ -748,16 +758,6 @@ static DEVICE_ATTR_RO(io_stat); static DEVICE_ATTR_RO(mm_stat); static DEVICE_ATTR_RO(debug_stat); -static void zram_slot_lock(struct zram *zram, u32 index) -{ - bit_spin_lock(ZRAM_ACCESS, &zram->table[index].value); -} - -static void zram_slot_unlock(struct zram *zram, u32 index) -{ - bit_spin_unlock(ZRAM_ACCESS, &zram->table[index].value); -} - static void zram_meta_free(struct zram *zram, u64 disksize) { size_t num_pages = disksize >> PAGE_SHIFT; diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 31762db861e3..a473d5c7d74f 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -60,9 +60,9 @@ static const size_t max_zpage_size = PAGE_SIZE / 4 * 3; /* Flags for zram pages (table[page_no].value) */ enum zram_pageflags { - /* Page consists the same element */ - ZRAM_SAME = ZRAM_FLAG_SHIFT, - ZRAM_ACCESS, /* page is now accessed */ + /* zram slot is locked */ + ZRAM_LOCK = ZRAM_FLAG_SHIFT, + ZRAM_SAME, /* Page consists the same element */ ZRAM_WB, /* page is stored on backing_device */ __NR_ZRAM_PAGEFLAGS, From 97ceebc2928b51984e79f8b70e89de5863fc30b1 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 7 Jun 2018 17:05:42 -0700 Subject: [PATCH 14/37] BACKPORT: zram: mark incompressible page as ZRAM_HUGE Mark incompressible pages so that we could investigate who is the owner of the incompressible pages once the page is swapped out via using upcoming zram memory tracker feature. With it, we could prevent such pages to be swapped out by using mlock. Otherwise we might remove them. This patch exposes new stat for huge pages via mm_stat. Link: http://lkml.kernel.org/r/20180416090946.63057-3-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 89e85bce4b02edb7408aebf69d5d1a6692a05f4f) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I62ffb792dc9832af00da255e970788ada69a7f8e --- Documentation/blockdev/zram.txt | 1 + drivers/block/zram/zram_drv.c | 17 ++++++++++++++--- drivers/block/zram/zram_drv.h | 2 ++ 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 257e65714c6a..78db38d02bc9 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -218,6 +218,7 @@ line of text and contains the following stats separated by whitespace: same_pages the number of same element filled pages written to this disk. No memory is allocated for such pages. pages_compacted the number of pages freed during compaction + huge_pages the number of incompressible pages 9) Deactivate: swapoff /dev/zram0 diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 4c14ce409fca..63e14ccafa93 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -724,14 +724,15 @@ static ssize_t mm_stat_show(struct device *dev, max_used = atomic_long_read(&zram->stats.max_used_pages); ret = scnprintf(buf, PAGE_SIZE, - "%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n", + "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu\n", orig_size << PAGE_SHIFT, (u64)atomic64_read(&zram->stats.compr_data_size), mem_used << PAGE_SHIFT, zram->limit_pages << PAGE_SHIFT, max_used << PAGE_SHIFT, (u64)atomic64_read(&zram->stats.same_pages), - pool_stats.pages_compacted); + pool_stats.pages_compacted, + (u64)atomic64_read(&zram->stats.huge_pages)); up_read(&zram->init_lock); return ret; @@ -798,6 +799,11 @@ static void zram_free_page(struct zram *zram, size_t index) { unsigned long handle; + if (zram_test_flag(zram, index, ZRAM_HUGE)) { + zram_clear_flag(zram, index, ZRAM_HUGE); + atomic64_dec(&zram->stats.huge_pages); + } + if (zram_wb_enabled(zram) && zram_test_flag(zram, index, ZRAM_WB)) { zram_wb_clear(zram, index); atomic64_dec(&zram->stats.pages_stored); @@ -966,6 +972,7 @@ compress_again: } if (unlikely(comp_len > max_zpage_size)) { + comp_len = PAGE_SIZE; if (zram_wb_enabled(zram) && allow_wb) { zcomp_stream_put(zram->comp); ret = write_to_bdev(zram, bvec, index, bio, &element); @@ -977,7 +984,6 @@ compress_again: allow_wb = false; goto compress_again; } - comp_len = PAGE_SIZE; } /* @@ -1039,6 +1045,11 @@ out: zram_slot_lock(zram, index); zram_free_page(zram, index); + if (comp_len == PAGE_SIZE) { + zram_set_flag(zram, index, ZRAM_HUGE); + atomic64_inc(&zram->stats.huge_pages); + } + if (flags) { zram_set_flag(zram, index, flags); zram_set_element(zram, index, element); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index a473d5c7d74f..4c4bc6042c89 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -64,6 +64,7 @@ enum zram_pageflags { ZRAM_LOCK = ZRAM_FLAG_SHIFT, ZRAM_SAME, /* Page consists the same element */ ZRAM_WB, /* page is stored on backing_device */ + ZRAM_HUGE, /* Incompressible page */ __NR_ZRAM_PAGEFLAGS, }; @@ -88,6 +89,7 @@ struct zram_stats { atomic64_t invalid_io; /* non-page-aligned I/O requests */ atomic64_t notify_free; /* no. of swap slot free notifications */ atomic64_t same_pages; /* no. of same element filled pages */ + atomic64_t huge_pages; /* no. of huge pages */ atomic64_t pages_stored; /* no. of pages currently stored */ atomic_long_t max_used_pages; /* no. of maximum pages stored */ atomic64_t writestall; /* no. of write slow paths */ From 6a83472c2f14e354742c669dd4e9e26763aca430 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 7 Jun 2018 17:05:45 -0700 Subject: [PATCH 15/37] UPSTREAM: zram: record accessed second zRam as swap is useful for small memory device. However, swap means those pages on zram are mostly cold pages due to VM's LRU algorithm. Especially, once init data for application are touched for launching, they tend to be not accessed any more and finally swapped out. zRAM can store such cold pages as compressed form but it's pointless to keep in memory. Better idea is app developers free them directly rather than remaining them on heap. This patch records last access time of each block of zram so that With upcoming zram memory tracking, it could help userspace developers to reduce memory footprint. Link: http://lkml.kernel.org/r/20180416090946.63057-4-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit d7eac6b6e1838ef1a1400df4ec55daa34bbc855e) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I34f5de4dd58ba54bb1cc85d558e60231b2810780 --- drivers/block/zram/zram_drv.c | 16 ++++++++++++++++ drivers/block/zram/zram_drv.h | 1 + 2 files changed, 17 insertions(+) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 63e14ccafa93..f05ea784967b 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -102,6 +102,16 @@ static inline void zram_set_element(struct zram *zram, u32 index, zram->table[index].element = element; } +static void zram_accessed(struct zram *zram, u32 index) +{ + zram->table[index].ac_time = sched_clock(); +} + +static void zram_reset_access(struct zram *zram, u32 index) +{ + zram->table[index].ac_time = 0; +} + static unsigned long zram_get_element(struct zram *zram, u32 index) { return zram->table[index].element; @@ -799,6 +809,8 @@ static void zram_free_page(struct zram *zram, size_t index) { unsigned long handle; + zram_reset_access(zram, index); + if (zram_test_flag(zram, index, ZRAM_HUGE)) { zram_clear_flag(zram, index, ZRAM_HUGE); atomic64_dec(&zram->stats.huge_pages); @@ -1170,6 +1182,10 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, generic_end_io_acct(q, rw_acct, &zram->disk->part0, start_time); + zram_slot_lock(zram, index); + zram_accessed(zram, index); + zram_slot_unlock(zram, index); + if (unlikely(ret < 0)) { if (!is_write) atomic64_inc(&zram->stats.failed_reads); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 4c4bc6042c89..79c73f50a2a2 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -78,6 +78,7 @@ struct zram_table_entry { unsigned long element; }; unsigned long value; + u64 ac_time; }; struct zram_stats { From 89d3f7ee7cfe8d2483695e54f1150139529ac0d3 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 7 Jun 2018 17:05:49 -0700 Subject: [PATCH 16/37] UPSTREAM: zram: introduce zram memory tracking zRam as swap is useful for small memory device. However, swap means those pages on zram are mostly cold pages due to VM's LRU algorithm. Especially, once init data for application are touched for launching, they tend to be not accessed any more and finally swapped out. zRAM can store such cold pages as compressed form but it's pointless to keep in memory. Better idea is app developers free them directly rather than remaining them on heap. This patch tell us last access time of each block of zram via "cat /sys/kernel/debug/zram/zram0/block_state". The output is as follows, 300 75.033841 .wh 301 63.806904 s.. 302 63.806919 ..h First column is zram's block index and 3rh one represents symbol (s: same page w: written page to backing store h: huge page) of the block state. Second column represents usec time unit of the block was last accessed. So above example means the 300th block is accessed at 75.033851 second and it was huge so it was written to the backing store. Admin can leverage this information to catch cold|incompressible pages of process with *pagemap* once part of heaps are swapped out. I used the feature a few years ago to find memory hoggers in userspace to notify them what memory they have wasted without touch for a long time. With it, they could reduce unnecessary memory space. However, at that time, I hacked up zram for the feature but now I need the feature again so I decided it would be better to upstream rather than keeping it alone. I hope I submit the userspace tool to use the feature soon. [akpm@linux-foundation.org: fix i386 printk warning] [minchan@kernel.org: use ktime_get_boottime() instead of sched_clock()] Link: http://lkml.kernel.org/r/20180420063525.GA253739@rodete-desktop-imager.corp.google.com [akpm@linux-foundation.org: documentation tweak] [akpm@linux-foundation.org: fix i386 printk warning] [minchan@kernel.org: fix compile warning] Link: http://lkml.kernel.org/r/20180508104849.GA8209@rodete-desktop-imager.corp.google.com [rdunlap@infradead.org: fix printk formats] Link: http://lkml.kernel.org/r/3652ccb1-96ef-0b0b-05d1-f661d7733dcc@infradead.org Link: http://lkml.kernel.org/r/20180416090946.63057-5-minchan@kernel.org Signed-off-by: Minchan Kim Signed-off-by: Randy Dunlap Reviewed-by: Sergey Senozhatsky Acked-by: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit c0265342bff4fcaa2cdf13f4596244c18d4a7ae5) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: Ie091bdfed4c732eaaa9e1561d0f24dc172d3668c --- Documentation/blockdev/zram.txt | 24 ++++++ drivers/block/zram/Kconfig | 14 +++- drivers/block/zram/zram_drv.c | 132 +++++++++++++++++++++++++++++--- drivers/block/zram/zram_drv.h | 7 +- 4 files changed, 163 insertions(+), 14 deletions(-) diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 78db38d02bc9..875b2b56b87f 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -243,5 +243,29 @@ to backing storage rather than keeping it in memory. User should set up backing device via /sys/block/zramX/backing_dev before disksize setting. += memory tracking + +With CONFIG_ZRAM_MEMORY_TRACKING, user can know information of the +zram block. It could be useful to catch cold or incompressible +pages of the process with*pagemap. +If you enable the feature, you could see block state via +/sys/kernel/debug/zram/zram0/block_state". The output is as follows, + + 300 75.033841 .wh + 301 63.806904 s.. + 302 63.806919 ..h + +First column is zram's block index. +Second column is access time since the system was booted +Third column is state of the block. +(s: same page +w: written page to backing store +h: huge page) + +First line of above example says 300th block is accessed at 75.033841sec +and the block's state is huge so it is written back to the backing +storage. It's a debugging feature so anyone shouldn't rely on it to work +properly. + Nitin Gupta ngupta@vflare.org diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index ac3a31d433b2..635235759a0a 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -13,7 +13,7 @@ config ZRAM It has several use cases, for example: /tmp storage, use as swap disks and maybe many more. - See zram.txt for more information. + See Documentation/blockdev/zram.txt for more information. config ZRAM_WRITEBACK bool "Write back incompressible page to backing device" @@ -25,4 +25,14 @@ config ZRAM_WRITEBACK For this feature, admin should set up backing device via /sys/block/zramX/backing_dev. - See zram.txt for more infomration. + See Documentation/blockdev/zram.txt for more information. + +config ZRAM_MEMORY_TRACKING + bool "Track zRam block status" + depends on ZRAM && DEBUG_FS + help + With this feature, admin can track the state of allocated blocks + of zRAM. Admin could see the information via + /sys/kernel/debug/zram/zramX/block_state. + + See Documentation/blockdev/zram.txt for more information. diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index f05ea784967b..76c5b2b39781 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include "zram_drv.h" @@ -62,6 +63,13 @@ static inline bool init_done(struct zram *zram) return zram->disksize; } +static inline bool zram_allocated(struct zram *zram, u32 index) +{ + + return (zram->table[index].value >> (ZRAM_FLAG_SHIFT + 1)) || + zram->table[index].handle; +} + static inline struct zram *dev_to_zram(struct device *dev) { return (struct zram *)dev_to_disk(dev)->private_data; @@ -78,7 +86,7 @@ static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle) } /* flag operations require table entry bit_spin_lock() being held */ -static int zram_test_flag(struct zram *zram, u32 index, +static bool zram_test_flag(struct zram *zram, u32 index, enum zram_pageflags flag) { return zram->table[index].value & BIT(flag); @@ -102,16 +110,6 @@ static inline void zram_set_element(struct zram *zram, u32 index, zram->table[index].element = element; } -static void zram_accessed(struct zram *zram, u32 index) -{ - zram->table[index].ac_time = sched_clock(); -} - -static void zram_reset_access(struct zram *zram, u32 index) -{ - zram->table[index].ac_time = 0; -} - static unsigned long zram_get_element(struct zram *zram, u32 index) { return zram->table[index].element; @@ -615,6 +613,114 @@ static int read_from_bdev(struct zram *zram, struct bio_vec *bvec, static void zram_wb_clear(struct zram *zram, u32 index) {} #endif +#ifdef CONFIG_ZRAM_MEMORY_TRACKING + +static struct dentry *zram_debugfs_root; + +static void zram_debugfs_create(void) +{ + zram_debugfs_root = debugfs_create_dir("zram", NULL); +} + +static void zram_debugfs_destroy(void) +{ + debugfs_remove_recursive(zram_debugfs_root); +} + +static void zram_accessed(struct zram *zram, u32 index) +{ + zram->table[index].ac_time = ktime_get_boottime(); +} + +static void zram_reset_access(struct zram *zram, u32 index) +{ + zram->table[index].ac_time = 0; +} + +static ssize_t read_block_state(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + char *kbuf; + ssize_t index, written = 0; + struct zram *zram = file->private_data; + unsigned long nr_pages = zram->disksize >> PAGE_SHIFT; + struct timespec64 ts; + + kbuf = kvmalloc(count, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + down_read(&zram->init_lock); + if (!init_done(zram)) { + up_read(&zram->init_lock); + kvfree(kbuf); + return -EINVAL; + } + + for (index = *ppos; index < nr_pages; index++) { + int copied; + + zram_slot_lock(zram, index); + if (!zram_allocated(zram, index)) + goto next; + + ts = ktime_to_timespec64(zram->table[index].ac_time); + copied = snprintf(kbuf + written, count, + "%12zd %12lld.%06lu %c%c%c\n", + index, (s64)ts.tv_sec, + ts.tv_nsec / NSEC_PER_USEC, + zram_test_flag(zram, index, ZRAM_SAME) ? 's' : '.', + zram_test_flag(zram, index, ZRAM_WB) ? 'w' : '.', + zram_test_flag(zram, index, ZRAM_HUGE) ? 'h' : '.'); + + if (count < copied) { + zram_slot_unlock(zram, index); + break; + } + written += copied; + count -= copied; +next: + zram_slot_unlock(zram, index); + *ppos += 1; + } + + up_read(&zram->init_lock); + if (copy_to_user(buf, kbuf, written)) + written = -EFAULT; + kvfree(kbuf); + + return written; +} + +static const struct file_operations proc_zram_block_state_op = { + .open = simple_open, + .read = read_block_state, + .llseek = default_llseek, +}; + +static void zram_debugfs_register(struct zram *zram) +{ + if (!zram_debugfs_root) + return; + + zram->debugfs_dir = debugfs_create_dir(zram->disk->disk_name, + zram_debugfs_root); + debugfs_create_file("block_state", 0400, zram->debugfs_dir, + zram, &proc_zram_block_state_op); +} + +static void zram_debugfs_unregister(struct zram *zram) +{ + debugfs_remove_recursive(zram->debugfs_dir); +} +#else +static void zram_debugfs_create(void) {}; +static void zram_debugfs_destroy(void) {}; +static void zram_accessed(struct zram *zram, u32 index) {}; +static void zram_reset_access(struct zram *zram, u32 index) {}; +static void zram_debugfs_register(struct zram *zram) {}; +static void zram_debugfs_unregister(struct zram *zram) {}; +#endif /* * We switched to per-cpu streams and this attr is not needed anymore. @@ -1597,6 +1703,7 @@ static int zram_add(void) } strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor)); + zram_debugfs_register(zram); pr_info("Added device: %s\n", zram->disk->disk_name); return device_id; @@ -1630,6 +1737,7 @@ static int zram_remove(struct zram *zram) zram->claim = true; mutex_unlock(&bdev->bd_mutex); + zram_debugfs_unregister(zram); /* * Remove sysfs first, so no one will perform a disksize * store while we destroy the devices. This also helps during @@ -1732,6 +1840,7 @@ static void destroy_devices(void) { class_unregister(&zram_control_class); idr_for_each(&zram_index_idr, &zram_remove_cb, NULL); + zram_debugfs_destroy(); idr_destroy(&zram_index_idr); unregister_blkdev(zram_major, "zram"); cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE); @@ -1753,6 +1862,7 @@ static int __init zram_init(void) return ret; } + zram_debugfs_create(); zram_major = register_blkdev(0, "zram"); if (zram_major <= 0) { pr_err("Unable to get major number\n"); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 79c73f50a2a2..bbda650f0dc1 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -78,7 +78,9 @@ struct zram_table_entry { unsigned long element; }; unsigned long value; - u64 ac_time; +#ifdef CONFIG_ZRAM_MEMORY_TRACKING + ktime_t ac_time; +#endif }; struct zram_stats { @@ -127,5 +129,8 @@ struct zram { unsigned long nr_pages; spinlock_t bitmap_lock; #endif +#ifdef CONFIG_ZRAM_MEMORY_TRACKING + struct dentry *debugfs_dir; +#endif }; #endif From 9f09ba3e3116ea36c06f8eaafcf02fa0af55a465 Mon Sep 17 00:00:00 2001 From: Peter Kalauskas Date: Tue, 21 Aug 2018 21:54:02 -0700 Subject: [PATCH 17/37] UPSTREAM: drivers/block/zram/zram_drv.c: fix bug storing backing_dev The call to strlcpy in backing_dev_store is incorrect. It should take the size of the destination buffer instead of the size of the source buffer. Additionally, ignore the newline character (\n) when reading the new file_name buffer. This makes it possible to set the backing_dev as follows: echo /dev/sdX > /sys/block/zram0/backing_dev The reason it worked before was the fact that strlcpy() copies 'len - 1' bytes, which is strlen(buf) - 1 in our case, so it accidentally didn't copy the trailing new line symbol. Which also means that "echo -n /dev/sdX" most likely was broken. Signed-off-by: Peter Kalauskas Link: http://lkml.kernel.org/r/20180813061623.GC64836@rodete-desktop-imager.corp.google.com Acked-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Cc: [4.14+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit c8bd134a4bddafe5917d163eea73873932c15e83) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I95e90c5b19981d443669a03de59821fb64b9c0ce --- drivers/block/zram/zram_drv.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 76c5b2b39781..4cb422fac081 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -331,6 +331,7 @@ static ssize_t backing_dev_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { char *file_name; + size_t sz; struct file *backing_dev = NULL; struct inode *inode; struct address_space *mapping; @@ -351,7 +352,11 @@ static ssize_t backing_dev_store(struct device *dev, goto out; } - strlcpy(file_name, buf, len); + strlcpy(file_name, buf, PATH_MAX); + /* ignore trailing newline */ + sz = strlen(file_name); + if (sz > 0 && file_name[sz - 1] == '\n') + file_name[sz - 1] = 0x00; backing_dev = filp_open(file_name, O_RDWR|O_LARGEFILE, 0); if (IS_ERR(backing_dev)) { From 47350a9f13c63b9b11f3d285f1e373508c983eda Mon Sep 17 00:00:00 2001 From: Peter Kalauskas Date: Fri, 24 Aug 2018 12:27:25 -0700 Subject: [PATCH 18/37] ANDROID: x86_64_cuttlefish_defconfig: Enable lz4 compression for zram Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I999c604d85f3a96bea97829b98101ea865a23275 --- arch/x86/configs/x86_64_cuttlefish_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/configs/x86_64_cuttlefish_defconfig b/arch/x86/configs/x86_64_cuttlefish_defconfig index 761739a11425..db63c91b57b7 100644 --- a/arch/x86/configs/x86_64_cuttlefish_defconfig +++ b/arch/x86/configs/x86_64_cuttlefish_defconfig @@ -454,6 +454,7 @@ CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1 CONFIG_CRYPTO_RSA=y # CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set CONFIG_CRYPTO_SHA512=y +CONFIG_CRYPTO_LZ4=y CONFIG_CRYPTO_ZSTD=y CONFIG_CRYPTO_DEV_VIRTIO=y CONFIG_ASYMMETRIC_KEY_TYPE=y From 1e42bf09df389ee2892632721d6fc1da4f44cd1c Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 5 Apr 2018 16:24:43 -0700 Subject: [PATCH 19/37] UPSTREAM: zsmalloc: introduce zs_huge_class_size() Patch series "zsmalloc/zram: drop zram's max_zpage_size", v3. ZRAM's max_zpage_size is a bad thing. It forces zsmalloc to store normal objects as huge ones, which results in bigger zsmalloc memory usage. Drop it and use actual zsmalloc huge-class value when decide if the object is huge or not. This patch (of 2): Not every object can be share its zspage with other objects, e.g. when the object is as big as zspage or nearly as big a zspage. For such objects zsmalloc has a so called huge class - every object which belongs to huge class consumes the entire zspage (which consists of a physical page). On x86_64, PAGE_SHIFT 12 box, the first non-huge class size is 3264, so starting down from size 3264, objects can share page(-s) and thus minimize memory wastage. ZRAM, however, has its own statically defined watermark for huge objects, namely "3 * PAGE_SIZE / 4 = 3072", and forcibly stores every object larger than this watermark (3072) as a PAGE_SIZE object, in other words, to a huge class, while zsmalloc can keep some of those objects in non-huge classes. This results in increased memory consumption. zsmalloc knows better if the object is huge or not. Introduce zs_huge_class_size() function which tells if the given object can be stored in one of non-huge classes or not. This will let us to drop ZRAM's huge object watermark and fully rely on zsmalloc when we decide if the object is huge. [sergey.senozhatsky.work@gmail.com: add pool param to zs_huge_class_size()] Link: http://lkml.kernel.org/r/20180314081833.1096-2-sergey.senozhatsky@gmail.com Link: http://lkml.kernel.org/r/20180306070639.7389-2-sergey.senozhatsky@gmail.com Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 010b495e2fa32353d0ef6aa70a8169e5ef617a15) Signed-off-by: Peter Kalauskas Bug: 113183619 Change-Id: I842d8234a53f30d2803139107f420f7217d6df6e --- include/linux/zsmalloc.h | 2 ++ mm/zsmalloc.c | 41 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 57a8e98f2708..2219cce81ca4 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -47,6 +47,8 @@ void zs_destroy_pool(struct zs_pool *pool); unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags); void zs_free(struct zs_pool *pool, unsigned long obj); +size_t zs_huge_class_size(struct zs_pool *pool); + void *zs_map_object(struct zs_pool *pool, unsigned long handle, enum zs_mapmode mm); void zs_unmap_object(struct zs_pool *pool, unsigned long handle); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 685049a9048d..abf1a560fa2b 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -190,6 +190,7 @@ static struct vfsmount *zsmalloc_mnt; * (see: fix_fullness_group()) */ static const int fullness_threshold_frac = 4; +static size_t huge_class_size; struct size_class { spinlock_t lock; @@ -1426,6 +1427,25 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) } EXPORT_SYMBOL_GPL(zs_unmap_object); +/** + * zs_huge_class_size() - Returns the size (in bytes) of the first huge + * zsmalloc &size_class. + * @pool: zsmalloc pool to use + * + * The function returns the size of the first huge class - any object of equal + * or bigger size will be stored in zspage consisting of a single physical + * page. + * + * Context: Any context. + * + * Return: the size (in bytes) of the first huge zsmalloc &size_class. + */ +size_t zs_huge_class_size(struct zs_pool *pool) +{ + return huge_class_size; +} +EXPORT_SYMBOL_GPL(zs_huge_class_size); + static unsigned long obj_malloc(struct size_class *class, struct zspage *zspage, unsigned long handle) { @@ -2385,6 +2405,27 @@ struct zs_pool *zs_create_pool(const char *name) pages_per_zspage = get_pages_per_zspage(size); objs_per_zspage = pages_per_zspage * PAGE_SIZE / size; + /* + * We iterate from biggest down to smallest classes, + * so huge_class_size holds the size of the first huge + * class. Any object bigger than or equal to that will + * endup in the huge class. + */ + if (pages_per_zspage != 1 && objs_per_zspage != 1 && + !huge_class_size) { + huge_class_size = size; + /* + * The object uses ZS_HANDLE_SIZE bytes to store the + * handle. We need to subtract it, because zs_malloc() + * unconditionally adds handle size before it performs + * size class search - so object may be smaller than + * huge class size, yet it still can end up in the huge + * class because it grows by ZS_HANDLE_SIZE extra bytes + * right before class lookup. + */ + huge_class_size -= (ZS_HANDLE_SIZE - 1); + } + /* * size_class is used for normal zsmalloc operation such * as alloc/free for that size. Although it is natural that we From e9a6a1abd019da9569225ba88e40e6394e961b8e Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 5 Apr 2018 16:24:47 -0700 Subject: [PATCH 20/37] BACKPORT: zram: drop max_zpage_size and use zs_huge_class_size() Remove ZRAM's enforced "huge object" value and use zsmalloc huge-class watermark instead, which makes more sense. TEST - I used a 1G zram device, LZO compression back-end, original data set size was 444MB. Looking at zsmalloc classes stats the test ended up to be pretty fair. BASE ZRAM/ZSMALLOC ===================== zram mm_stat 498978816 191482495 199831552 0 199831552 15634 0 zsmalloc classes class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable ... 151 2448 0 0 1240 1240 744 3 0 168 2720 0 0 4200 4200 2800 2 0 190 3072 0 0 10100 10100 7575 3 0 202 3264 0 0 380 380 304 4 0 254 4096 0 0 10620 10620 10620 1 0 Total 7 46 106982 106187 48787 0 PATCHED ZRAM/ZSMALLOC ===================== zram mm_stat 498978816 182579184 194248704 0 194248704 15628 0 zsmalloc classes class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable ... 151 2448 0 0 1240 1240 744 3 0 168 2720 0 0 4200 4200 2800 2 0 190 3072 0 0 10100 10100 7575 3 0 202 3264 0 0 7180 7180 5744 4 0 254 4096 0 0 3820 3820 3820 1 0 Total 8 45 106959 106193 47424 0 As we can see, we reduced the number of objects stored in class-4096, because a huge number of objects which we previously forcibly stored in class-4096 now stored in non-huge class-3264. This results in lower memory consumption: - zsmalloc now uses 47424 physical pages, which is less than 48787 pages zsmalloc used before. - objects that we store in class-3264 share zspages. That's why overall the number of pages that both class-4096 and class-3264 consumed went down from 10924 to 9564. [sergey.senozhatsky.work@gmail.com: add pool param to zs_huge_class_size()] Link: http://lkml.kernel.org/r/20180314081833.1096-3-sergey.senozhatsky@gmail.com Link: http://lkml.kernel.org/r/20180306070639.7389-3-sergey.senozhatsky@gmail.com Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 60f5921a9a4f126e081318bd6bb2bc2798b7bba8) Signed-off-by: Peter Kalauskas Bug: 113183619 Change-Id: I926cb873af0718d5392f8daaed1d7e592455d589 --- drivers/block/zram/zram_drv.c | 10 ++++++++-- drivers/block/zram/zram_drv.h | 16 ---------------- 2 files changed, 8 insertions(+), 18 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 4cb422fac081..06f2623d0199 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -45,6 +45,11 @@ static const char *default_compressor = "lzo"; /* Module params (documentation at end) */ static unsigned int num_devices = 1; +/* + * Pages that compress to sizes equals or greater than this are stored + * uncompressed in memory. + */ +static size_t huge_class_size; static void zram_free_page(struct zram *zram, size_t index); @@ -908,6 +913,8 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) return false; } + if (!huge_class_size) + huge_class_size = zs_huge_class_size(zram->mem_pool); return true; } @@ -1094,8 +1101,7 @@ compress_again: return ret; } - if (unlikely(comp_len > max_zpage_size)) { - comp_len = PAGE_SIZE; + if (unlikely(comp_len >= huge_class_size)) { if (zram_wb_enabled(zram) && allow_wb) { zcomp_stream_put(zram->comp); ret = write_to_bdev(zram, bvec, index, bio, &element); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index bbda650f0dc1..3a1cac486e96 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -21,22 +21,6 @@ #include "zcomp.h" -/*-- Configurable parameters */ - -/* - * Pages that compress to size greater than this are stored - * uncompressed in memory. - */ -static const size_t max_zpage_size = PAGE_SIZE / 4 * 3; - -/* - * NOTE: max_zpage_size must be less than or equal to: - * ZS_MAX_ALLOC_SIZE. Otherwise, zs_malloc() would - * always return failure. - */ - -/*-- End of configurable params */ - #define SECTOR_SHIFT 9 #define SECTORS_PER_PAGE_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) #define SECTORS_PER_PAGE (1 << SECTORS_PER_PAGE_SHIFT) From c687b89c2f0649a643aaa0f6c86bf96d29018c21 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Wed, 29 Nov 2017 18:23:56 +0100 Subject: [PATCH 21/37] UPSTREAM: xfrm: fix XFRMA_OUTPUT_MARK policy entry This seems to be an obvious typo, NLA_U32 is type of the attribute, not its (minimal) length. Fixes: 077fbac405bf ("net: xfrm: support setting an output mark.") Signed-off-by: Michal Kubecek Signed-off-by: Steffen Klassert (cherry picked from commit e719135881f00c01ca400abb8a5dadaf297a24f9) Signed-off-by: Benedict Wong Bug: 113046120 Change-Id: I4c1a8de03febfa246b99c7eb67d77f74a1e3ba93 --- net/xfrm/xfrm_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index dde40f995ac0..0359bc047eb6 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2487,7 +2487,7 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_PROTO] = { .type = NLA_U8 }, [XFRMA_ADDRESS_FILTER] = { .len = sizeof(struct xfrm_address_filter) }, [XFRMA_OFFLOAD_DEV] = { .len = sizeof(struct xfrm_user_offload) }, - [XFRMA_OUTPUT_MARK] = { .len = NLA_U32 }, + [XFRMA_OUTPUT_MARK] = { .type = NLA_U32 }, }; static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = { From 8508afd498691dfe43b90cf706b612e3b915888a Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Tue, 12 Jun 2018 12:44:26 +0200 Subject: [PATCH 22/37] UPSTREAM: xfrm: Extend the output_mark to support input direction and masking. We already support setting an output mark at the xfrm_state, unfortunately this does not support the input direction and masking the marks that will be applied to the skb. This change adds support applying a masked value in both directions. The existing XFRMA_OUTPUT_MARK number is reused for this purpose and as it is now bi-directional, it is renamed to XFRMA_SET_MARK. An additional XFRMA_SET_MARK_MASK attribute is added for setting the mask. If the attribute mask not provided, it is set to 0xffffffff, keeping the XFRMA_OUTPUT_MARK existing 'full mask' semantics. Co-developed-by: Tobias Brunner Co-developed-by: Eyal Birger Co-developed-by: Lorenzo Colitti Signed-off-by: Steffen Klassert Signed-off-by: Tobias Brunner Signed-off-by: Eyal Birger Signed-off-by: Lorenzo Colitti (cherry picked from commit 9b42c1f179a614e11893ae4619f0304a38f481ae) Signed-off-by: Benedict Wong Bug: 113046120 Change-Id: I582f0b460dc58f01e0c30afb6167725aa337d054 --- include/net/xfrm.h | 9 +++++++- include/uapi/linux/xfrm.h | 4 +++- net/xfrm/xfrm_device.c | 3 ++- net/xfrm/xfrm_input.c | 2 ++ net/xfrm/xfrm_output.c | 3 +-- net/xfrm/xfrm_policy.c | 5 ++-- net/xfrm/xfrm_user.c | 48 +++++++++++++++++++++++++++++++-------- 7 files changed, 57 insertions(+), 17 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index db99efb2d1d0..404769f00120 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -166,7 +166,7 @@ struct xfrm_state { int header_len; int trailer_len; u32 extra_flags; - u32 output_mark; + struct xfrm_mark smark; } props; struct xfrm_lifetime_cfg lft; @@ -1948,6 +1948,13 @@ static inline int xfrm_mark_put(struct sk_buff *skb, const struct xfrm_mark *m) return ret; } +static inline __u32 xfrm_smark_get(__u32 mark, struct xfrm_state *x) +{ + struct xfrm_mark *m = &x->props.smark; + + return (m->v & m->m) | (mark & ~m->m); +} + static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x, unsigned int family) { diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index e3af2859188b..5a6ed7ce5a29 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -305,9 +305,11 @@ enum xfrm_attr_type_t { XFRMA_ADDRESS_FILTER, /* struct xfrm_address_filter */ XFRMA_PAD, XFRMA_OFFLOAD_DEV, /* struct xfrm_state_offload */ - XFRMA_OUTPUT_MARK, /* __u32 */ + XFRMA_SET_MARK, /* __u32 */ + XFRMA_SET_MARK_MASK, /* __u32 */ __XFRMA_MAX +#define XFRMA_OUTPUT_MARK XFRMA_SET_MARK /* Compatibility */ #define XFRMA_MAX (__XFRMA_MAX - 1) }; diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 30e5746085b8..4a6859e6d0fc 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -80,7 +80,8 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, } dst = __xfrm_dst_lookup(net, 0, 0, saddr, daddr, - x->props.family, x->props.output_mark); + x->props.family, + xfrm_smark_get(0, x)); if (IS_ERR(dst)) return 0; diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 9f492dc417d5..dc8beae79087 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -339,6 +339,8 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) goto drop; } + skb->mark = xfrm_smark_get(skb->mark, x); + skb->sp->xvec[skb->sp->len++] = x; lock: diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 35610cc881a9..0f264049df82 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -66,8 +66,7 @@ static int xfrm_output_one(struct sk_buff *skb, int err) goto error_nolock; } - if (x->props.output_mark) - skb->mark = x->props.output_mark; + skb->mark = xfrm_smark_get(skb->mark, x); err = x->outer_mode->output(x, skb); if (err) { diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 9c57d6a5816c..0c0039633e72 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1605,10 +1605,11 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, dst_copy_metrics(dst1, dst); if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) { + __u32 mark = xfrm_smark_get(fl->flowi_mark, xfrm[i]); + family = xfrm[i]->props.family; dst = xfrm_dst_lookup(xfrm[i], tos, fl->flowi_oif, - &saddr, &daddr, family, - xfrm[i]->props.output_mark); + &saddr, &daddr, family, mark); err = PTR_ERR(dst); if (IS_ERR(dst)) goto put_states; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 0359bc047eb6..203a2a4a0b2d 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -527,6 +527,19 @@ static void xfrm_update_ae_params(struct xfrm_state *x, struct nlattr **attrs, x->replay_maxdiff = nla_get_u32(rt); } +static void xfrm_smark_init(struct nlattr **attrs, struct xfrm_mark *m) +{ + if (attrs[XFRMA_SET_MARK]) { + m->v = nla_get_u32(attrs[XFRMA_SET_MARK]); + if (attrs[XFRMA_SET_MARK_MASK]) + m->m = nla_get_u32(attrs[XFRMA_SET_MARK_MASK]); + else + m->m = 0xffffffff; + } else { + m->v = m->m = 0; + } +} + static struct xfrm_state *xfrm_state_construct(struct net *net, struct xfrm_usersa_info *p, struct nlattr **attrs, @@ -579,8 +592,7 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, xfrm_mark_get(attrs, &x->mark); - if (attrs[XFRMA_OUTPUT_MARK]) - x->props.output_mark = nla_get_u32(attrs[XFRMA_OUTPUT_MARK]); + xfrm_smark_init(attrs, &x->props.smark); err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV]); if (err) @@ -820,6 +832,18 @@ static int copy_to_user_auth(struct xfrm_algo_auth *auth, struct sk_buff *skb) return 0; } +static int xfrm_smark_put(struct sk_buff *skb, struct xfrm_mark *m) +{ + int ret = 0; + + if (m->v | m->m) { + ret = nla_put_u32(skb, XFRMA_SET_MARK, m->v); + if (!ret) + ret = nla_put_u32(skb, XFRMA_SET_MARK_MASK, m->m); + } + return ret; +} + /* Don't change this without updating xfrm_sa_len! */ static int copy_to_user_state_extra(struct xfrm_state *x, struct xfrm_usersa_info *p, @@ -883,6 +907,11 @@ static int copy_to_user_state_extra(struct xfrm_state *x, ret = xfrm_mark_put(skb, &x->mark); if (ret) goto out; + + ret = xfrm_smark_put(skb, &x->props.smark); + if (ret) + goto out; + if (x->replay_esn) ret = nla_put(skb, XFRMA_REPLAY_ESN_VAL, xfrm_replay_state_esn_len(x->replay_esn), @@ -896,11 +925,7 @@ static int copy_to_user_state_extra(struct xfrm_state *x, ret = copy_user_offload(&x->xso, skb); if (ret) goto out; - if (x->props.output_mark) { - ret = nla_put_u32(skb, XFRMA_OUTPUT_MARK, x->props.output_mark); - if (ret) - goto out; - } + if (x->security) ret = copy_sec_ctx(x->security, skb); out: @@ -2487,7 +2512,8 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_PROTO] = { .type = NLA_U8 }, [XFRMA_ADDRESS_FILTER] = { .len = sizeof(struct xfrm_address_filter) }, [XFRMA_OFFLOAD_DEV] = { .len = sizeof(struct xfrm_user_offload) }, - [XFRMA_OUTPUT_MARK] = { .type = NLA_U32 }, + [XFRMA_SET_MARK] = { .type = NLA_U32 }, + [XFRMA_SET_MARK_MASK] = { .type = NLA_U32 }, }; static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = { @@ -2712,8 +2738,10 @@ static inline size_t xfrm_sa_len(struct xfrm_state *x) l += nla_total_size(sizeof(x->props.extra_flags)); if (x->xso.dev) l += nla_total_size(sizeof(x->xso)); - if (x->props.output_mark) - l += nla_total_size(sizeof(x->props.output_mark)); + if (x->props.smark.v | x->props.smark.m) { + l += nla_total_size(sizeof(x->props.smark.v)); + l += nla_total_size(sizeof(x->props.smark.m)); + } /* Must count x->lastused as it may become non-zero behind our back. */ l += nla_total_size_64bit(sizeof(u64)); From 62b23f0e2cc6a67203783b254ceb38ab6edd3282 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Tue, 12 Jun 2018 14:06:57 +0200 Subject: [PATCH 23/37] UPSTREAM: flow: Extend flow informations with xfrm interface id. Add a new flowi_xfrm structure with informations needed to do a xfrm lookup. At the moment it keeps the informations about the new xfrm interface id needed to lookup xfrm interfaces that are introduced with a followup patch. We need this new lookup key as other possible keys, like the ifindex is already part of the xfrm selector and used as a key to enforce the output device after the transformation in the policy/state lookup. Signed-off-by: Steffen Klassert Acked-by: Shannon Nelson Acked-by: Benedict Wong Tested-by: Benedict Wong Tested-by: Antony Antony Reviewed-by: Eyal Birger (cherry picked from commit d159ce7957eec306eacda672e5909e26675ca8ef) Signed-off-by: Benedict Wong Bug: 113046120 Change-Id: I70b520d3cf67cd663e84868b0e7cc45ffa74d080 --- include/net/flow.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/net/flow.h b/include/net/flow.h index f1624fd5b1d0..6c2272213a6b 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -26,6 +26,10 @@ struct flowi_tunnel { __be64 tun_id; }; +struct flowi_xfrm { + __u32 if_id; +}; + struct flowi_common { int flowic_oif; int flowic_iif; @@ -39,6 +43,7 @@ struct flowi_common { #define FLOWI_FLAG_SKIP_NH_OIF 0x04 __u32 flowic_secid; struct flowi_tunnel flowic_tun_key; + struct flowi_xfrm xfrm; kuid_t flowic_uid; }; @@ -78,6 +83,7 @@ struct flowi4 { #define flowi4_secid __fl_common.flowic_secid #define flowi4_tun_key __fl_common.flowic_tun_key #define flowi4_uid __fl_common.flowic_uid +#define flowi4_xfrm __fl_common.xfrm /* (saddr,daddr) must be grouped, same order as in IP header */ __be32 saddr; @@ -109,6 +115,7 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, fl4->flowi4_flags = flags; fl4->flowi4_secid = 0; fl4->flowi4_tun_key.tun_id = 0; + fl4->flowi4_xfrm.if_id = 0; fl4->flowi4_uid = uid; fl4->daddr = daddr; fl4->saddr = saddr; @@ -138,6 +145,7 @@ struct flowi6 { #define flowi6_secid __fl_common.flowic_secid #define flowi6_tun_key __fl_common.flowic_tun_key #define flowi6_uid __fl_common.flowic_uid +#define flowi6_xfrm __fl_common.xfrm struct in6_addr daddr; struct in6_addr saddr; /* Note: flowi6_tos is encoded in flowlabel, too. */ @@ -185,6 +193,7 @@ struct flowi { #define flowi_secid u.__fl_common.flowic_secid #define flowi_tun_key u.__fl_common.flowic_tun_key #define flowi_uid u.__fl_common.flowic_uid +#define flowi_xfrm u.__fl_common.xfrm } __attribute__((__aligned__(BITS_PER_LONG/8))); static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4) From a5041fd6233271cdad0f64b617b02f12018ab1cc Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Tue, 12 Jun 2018 14:07:07 +0200 Subject: [PATCH 24/37] UPSTREAM: xfrm: Add a new lookup key to match xfrm interfaces. This patch adds the xfrm interface id as a lookup key for xfrm states and policies. With this we can assign states and policies to virtual xfrm interfaces. Signed-off-by: Steffen Klassert Acked-by: Shannon Nelson Acked-by: Benedict Wong Tested-by: Benedict Wong Tested-by: Antony Antony Reviewed-by: Eyal Birger (cherry picked from commit 7e6526404adedf079279aa7aa11722deaca8fe2e) Signed-off-by: Benedict Wong Bug: 113046120 Change-Id: I27d7757a374b0bd5f97c3e723773d6c7470a0717 --- include/net/xfrm.h | 21 +++++++++++---- include/uapi/linux/xfrm.h | 1 + net/core/pktgen.c | 2 +- net/key/af_key.c | 6 ++--- net/xfrm/xfrm_policy.c | 18 +++++++++---- net/xfrm/xfrm_state.c | 19 ++++++++++---- net/xfrm/xfrm_user.c | 54 ++++++++++++++++++++++++++++++++++----- 7 files changed, 96 insertions(+), 25 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 404769f00120..687750094fcd 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -147,6 +147,7 @@ struct xfrm_state { struct xfrm_id id; struct xfrm_selector sel; struct xfrm_mark mark; + u32 if_id; u32 tfcpad; u32 genid; @@ -573,6 +574,7 @@ struct xfrm_policy { atomic_t genid; u32 priority; u32 index; + u32 if_id; struct xfrm_mark mark; struct xfrm_selector selector; struct xfrm_lifetime_cfg lft; @@ -1501,7 +1503,7 @@ struct xfrm_state *xfrm_state_find(const xfrm_address_t *daddr, struct xfrm_tmpl *tmpl, struct xfrm_policy *pol, int *err, unsigned short family); -struct xfrm_state *xfrm_stateonly_find(struct net *net, u32 mark, +struct xfrm_state *xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id, xfrm_address_t *daddr, xfrm_address_t *saddr, unsigned short family, @@ -1658,20 +1660,20 @@ int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk, void *); void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net); int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl); -struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, +struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id, u8 type, int dir, struct xfrm_selector *sel, struct xfrm_sec_ctx *ctx, int delete, int *err); -struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8, int dir, - u32 id, int delete, int *err); +struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u32 if_id, u8, + int dir, u32 id, int delete, int *err); int xfrm_policy_flush(struct net *net, u8 type, bool task_valid); void xfrm_policy_hash_rebuild(struct net *net); u32 xfrm_get_acqseq(void); int verify_spi_info(u8 proto, u32 min, u32 max); int xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi); struct xfrm_state *xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, - u8 mode, u32 reqid, u8 proto, + u8 mode, u32 reqid, u32 if_id, u8 proto, const xfrm_address_t *daddr, const xfrm_address_t *saddr, int create, unsigned short family); @@ -1955,6 +1957,15 @@ static inline __u32 xfrm_smark_get(__u32 mark, struct xfrm_state *x) return (m->v & m->m) | (mark & ~m->m); } +static inline int xfrm_if_id_put(struct sk_buff *skb, __u32 if_id) +{ + int ret = 0; + + if (if_id) + ret = nla_put_u32(skb, XFRMA_IF_ID, if_id); + return ret; +} + static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x, unsigned int family) { diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 5a6ed7ce5a29..5f3b9fec7b5f 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -307,6 +307,7 @@ enum xfrm_attr_type_t { XFRMA_OFFLOAD_DEV, /* struct xfrm_state_offload */ XFRMA_SET_MARK, /* __u32 */ XFRMA_SET_MARK_MASK, /* __u32 */ + XFRMA_IF_ID, /* __u32 */ __XFRMA_MAX #define XFRMA_OUTPUT_MARK XFRMA_SET_MARK /* Compatibility */ diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 6e1e10ff433a..3bc92c1b7e62 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2345,7 +2345,7 @@ static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow) x = xfrm_state_lookup_byspi(pn->net, htonl(pkt_dev->spi), AF_INET); } else { /* slow path: we dont already have xfrm_state */ - x = xfrm_stateonly_find(pn->net, DUMMY_MARK, + x = xfrm_stateonly_find(pn->net, DUMMY_MARK, 0, (xfrm_address_t *)&pkt_dev->cur_daddr, (xfrm_address_t *)&pkt_dev->cur_saddr, AF_INET, diff --git a/net/key/af_key.c b/net/key/af_key.c index 3b209cbfe1df..294b68e3a37a 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1383,7 +1383,7 @@ static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_ } if (!x) - x = xfrm_find_acq(net, &dummy_mark, mode, reqid, proto, xdaddr, xsaddr, 1, family); + x = xfrm_find_acq(net, &dummy_mark, mode, reqid, 0, proto, xdaddr, xsaddr, 1, family); if (x == NULL) return -ENOENT; @@ -2412,7 +2412,7 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, const struct sa return err; } - xp = xfrm_policy_bysel_ctx(net, DUMMY_MARK, XFRM_POLICY_TYPE_MAIN, + xp = xfrm_policy_bysel_ctx(net, DUMMY_MARK, 0, XFRM_POLICY_TYPE_MAIN, pol->sadb_x_policy_dir - 1, &sel, pol_ctx, 1, &err); security_xfrm_policy_free(pol_ctx); @@ -2661,7 +2661,7 @@ static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, const struct sadb_ return -EINVAL; delete = (hdr->sadb_msg_type == SADB_X_SPDDELETE2); - xp = xfrm_policy_byid(net, DUMMY_MARK, XFRM_POLICY_TYPE_MAIN, + xp = xfrm_policy_byid(net, DUMMY_MARK, 0, XFRM_POLICY_TYPE_MAIN, dir, pol->sadb_x_policy_id, delete, &err); if (xp == NULL) return -ENOENT; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 0c0039633e72..a3cd67ca24eb 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -748,6 +748,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) newpos = NULL; hlist_for_each_entry(pol, chain, bydst) { if (pol->type == policy->type && + pol->if_id == policy->if_id && !selector_cmp(&pol->selector, &policy->selector) && xfrm_policy_mark_match(policy, pol) && xfrm_sec_ctx_match(pol->security, policy->security) && @@ -799,8 +800,9 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) } EXPORT_SYMBOL(xfrm_policy_insert); -struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type, - int dir, struct xfrm_selector *sel, +struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id, + u8 type, int dir, + struct xfrm_selector *sel, struct xfrm_sec_ctx *ctx, int delete, int *err) { @@ -813,6 +815,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type, ret = NULL; hlist_for_each_entry(pol, chain, bydst) { if (pol->type == type && + pol->if_id == if_id && (mark & pol->mark.m) == pol->mark.v && !selector_cmp(sel, &pol->selector) && xfrm_sec_ctx_match(ctx, pol->security)) { @@ -838,8 +841,9 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type, } EXPORT_SYMBOL(xfrm_policy_bysel_ctx); -struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type, - int dir, u32 id, int delete, int *err) +struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u32 if_id, + u8 type, int dir, u32 id, int delete, + int *err) { struct xfrm_policy *pol, *ret; struct hlist_head *chain; @@ -854,6 +858,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type, ret = NULL; hlist_for_each_entry(pol, chain, byidx) { if (pol->type == type && pol->index == id && + pol->if_id == if_id && (mark & pol->mark.m) == pol->mark.v) { xfrm_pol_hold(pol); if (delete) { @@ -1064,6 +1069,7 @@ static int xfrm_policy_match(const struct xfrm_policy *pol, bool match; if (pol->family != family || + pol->if_id != fl->flowi_xfrm.if_id || (fl->flowi_mark & pol->mark.m) != pol->mark.v || pol->type != type) return ret; @@ -1178,7 +1184,8 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, match = xfrm_selector_match(&pol->selector, fl, family); if (match) { - if ((sk->sk_mark & pol->mark.m) != pol->mark.v) { + if ((sk->sk_mark & pol->mark.m) != pol->mark.v || + pol->if_id != fl->flowi_xfrm.if_id) { pol = NULL; goto out; } @@ -1306,6 +1313,7 @@ static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir) newp->lft = old->lft; newp->curlft = old->curlft; newp->mark = old->mark; + newp->if_id = old->if_id; newp->action = old->action; newp->flags = old->flags; newp->xfrm_nr = old->xfrm_nr; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 6c4ec69e11a0..a065ca04e4f1 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -941,6 +941,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, int error = 0; struct xfrm_state *best = NULL; u32 mark = pol->mark.v & pol->mark.m; + u32 if_id = fl->flowi_xfrm.if_id; unsigned short encap_family = tmpl->encap_family; unsigned int sequence; struct km_event c; @@ -955,6 +956,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, if (x->props.family == encap_family && x->props.reqid == tmpl->reqid && (mark & x->mark.m) == x->mark.v && + x->if_id == if_id && !(x->props.flags & XFRM_STATE_WILDRECV) && xfrm_state_addr_check(x, daddr, saddr, encap_family) && tmpl->mode == x->props.mode && @@ -971,6 +973,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, if (x->props.family == encap_family && x->props.reqid == tmpl->reqid && (mark & x->mark.m) == x->mark.v && + x->if_id == if_id && !(x->props.flags & XFRM_STATE_WILDRECV) && xfrm_addr_equal(&x->id.daddr, daddr, encap_family) && tmpl->mode == x->props.mode && @@ -1010,6 +1013,7 @@ found: * to current session. */ xfrm_init_tempstate(x, fl, tmpl, daddr, saddr, family); memcpy(&x->mark, &pol->mark, sizeof(x->mark)); + x->if_id = if_id; error = security_xfrm_state_alloc_acquire(x, pol->security, fl->flowi_secid); if (error) { @@ -1067,7 +1071,7 @@ out: } struct xfrm_state * -xfrm_stateonly_find(struct net *net, u32 mark, +xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id, xfrm_address_t *daddr, xfrm_address_t *saddr, unsigned short family, u8 mode, u8 proto, u32 reqid) { @@ -1080,6 +1084,7 @@ xfrm_stateonly_find(struct net *net, u32 mark, if (x->props.family == family && x->props.reqid == reqid && (mark & x->mark.m) == x->mark.v && + x->if_id == if_id && !(x->props.flags & XFRM_STATE_WILDRECV) && xfrm_state_addr_check(x, daddr, saddr, family) && mode == x->props.mode && @@ -1160,11 +1165,13 @@ static void __xfrm_state_bump_genids(struct xfrm_state *xnew) struct xfrm_state *x; unsigned int h; u32 mark = xnew->mark.v & xnew->mark.m; + u32 if_id = xnew->if_id; h = xfrm_dst_hash(net, &xnew->id.daddr, &xnew->props.saddr, reqid, family); hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) { if (x->props.family == family && x->props.reqid == reqid && + x->if_id == if_id && (mark & x->mark.m) == x->mark.v && xfrm_addr_equal(&x->id.daddr, &xnew->id.daddr, family) && xfrm_addr_equal(&x->props.saddr, &xnew->props.saddr, family)) @@ -1187,7 +1194,7 @@ EXPORT_SYMBOL(xfrm_state_insert); static struct xfrm_state *__find_acq_core(struct net *net, const struct xfrm_mark *m, unsigned short family, u8 mode, - u32 reqid, u8 proto, + u32 reqid, u32 if_id, u8 proto, const xfrm_address_t *daddr, const xfrm_address_t *saddr, int create) @@ -1242,6 +1249,7 @@ static struct xfrm_state *__find_acq_core(struct net *net, x->props.family = family; x->props.mode = mode; x->props.reqid = reqid; + x->if_id = if_id; x->mark.v = m->v; x->mark.m = m->m; x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires; @@ -1296,7 +1304,7 @@ int xfrm_state_add(struct xfrm_state *x) if (use_spi && !x1) x1 = __find_acq_core(net, &x->mark, family, x->props.mode, - x->props.reqid, x->id.proto, + x->props.reqid, x->if_id, x->id.proto, &x->id.daddr, &x->props.saddr, 0); __xfrm_state_bump_genids(x); @@ -1395,6 +1403,7 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, x->props.flags = orig->props.flags; x->props.extra_flags = orig->props.extra_flags; + x->if_id = orig->if_id; x->tfcpad = orig->tfcpad; x->replay_maxdiff = orig->replay_maxdiff; x->replay_maxage = orig->replay_maxage; @@ -1613,13 +1622,13 @@ EXPORT_SYMBOL(xfrm_state_lookup_byaddr); struct xfrm_state * xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, u8 mode, u32 reqid, - u8 proto, const xfrm_address_t *daddr, + u32 if_id, u8 proto, const xfrm_address_t *daddr, const xfrm_address_t *saddr, int create, unsigned short family) { struct xfrm_state *x; spin_lock_bh(&net->xfrm.xfrm_state_lock); - x = __find_acq_core(net, mark, family, mode, reqid, proto, daddr, saddr, create); + x = __find_acq_core(net, mark, family, mode, reqid, if_id, proto, daddr, saddr, create); spin_unlock_bh(&net->xfrm.xfrm_state_lock); return x; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 203a2a4a0b2d..fea06162d3ed 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -594,6 +594,9 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, xfrm_smark_init(attrs, &x->props.smark); + if (attrs[XFRMA_IF_ID]) + x->if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV]); if (err) goto error; @@ -925,7 +928,11 @@ static int copy_to_user_state_extra(struct xfrm_state *x, ret = copy_user_offload(&x->xso, skb); if (ret) goto out; - + if (x->if_id) { + ret = nla_put_u32(skb, XFRMA_IF_ID, x->if_id); + if (ret) + goto out; + } if (x->security) ret = copy_sec_ctx(x->security, skb); out: @@ -1272,6 +1279,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, int err; u32 mark; struct xfrm_mark m; + u32 if_id = 0; p = nlmsg_data(nlh); err = verify_spi_info(p->info.id.proto, p->min, p->max); @@ -1284,6 +1292,10 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, x = NULL; mark = xfrm_mark_get(attrs, &m); + + if (attrs[XFRMA_IF_ID]) + if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + if (p->info.seq) { x = xfrm_find_acq_byseq(net, mark, p->info.seq); if (x && !xfrm_addr_equal(&x->id.daddr, daddr, family)) { @@ -1294,7 +1306,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, if (!x) x = xfrm_find_acq(net, &m, p->info.mode, p->info.reqid, - p->info.id.proto, daddr, + if_id, p->info.id.proto, daddr, &p->info.saddr, 1, family); err = -ENOENT; @@ -1582,6 +1594,9 @@ static struct xfrm_policy *xfrm_policy_construct(struct net *net, struct xfrm_us xfrm_mark_get(attrs, &xp->mark); + if (attrs[XFRMA_IF_ID]) + xp->if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + return xp; error: *errp = err; @@ -1729,6 +1744,8 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr err = copy_to_user_policy_type(xp->type, skb); if (!err) err = xfrm_mark_put(skb, &xp->mark); + if (!err) + err = xfrm_if_id_put(skb, xp->if_id); if (err) { nlmsg_cancel(skb, nlh); return err; @@ -1810,6 +1827,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, int delete; struct xfrm_mark m; u32 mark = xfrm_mark_get(attrs, &m); + u32 if_id = 0; p = nlmsg_data(nlh); delete = nlh->nlmsg_type == XFRM_MSG_DELPOLICY; @@ -1822,8 +1840,11 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, if (err) return err; + if (attrs[XFRMA_IF_ID]) + if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + if (p->index) - xp = xfrm_policy_byid(net, mark, type, p->dir, p->index, delete, &err); + xp = xfrm_policy_byid(net, mark, if_id, type, p->dir, p->index, delete, &err); else { struct nlattr *rt = attrs[XFRMA_SEC_CTX]; struct xfrm_sec_ctx *ctx; @@ -1840,7 +1861,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, if (err) return err; } - xp = xfrm_policy_bysel_ctx(net, mark, type, p->dir, &p->sel, + xp = xfrm_policy_bysel_ctx(net, mark, if_id, type, p->dir, &p->sel, ctx, delete, &err); security_xfrm_policy_free(ctx); } @@ -1963,6 +1984,10 @@ static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct if (err) goto out_cancel; + err = xfrm_if_id_put(skb, x->if_id); + if (err) + goto out_cancel; + nlmsg_end(skb, nlh); return 0; @@ -2104,6 +2129,7 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh, int err = -ENOENT; struct xfrm_mark m; u32 mark = xfrm_mark_get(attrs, &m); + u32 if_id = 0; err = copy_from_user_policy_type(&type, attrs); if (err) @@ -2113,8 +2139,11 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh, if (err) return err; + if (attrs[XFRMA_IF_ID]) + if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + if (p->index) - xp = xfrm_policy_byid(net, mark, type, p->dir, p->index, 0, &err); + xp = xfrm_policy_byid(net, mark, if_id, type, p->dir, p->index, 0, &err); else { struct nlattr *rt = attrs[XFRMA_SEC_CTX]; struct xfrm_sec_ctx *ctx; @@ -2131,7 +2160,7 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh, if (err) return err; } - xp = xfrm_policy_bysel_ctx(net, mark, type, p->dir, + xp = xfrm_policy_bysel_ctx(net, mark, if_id, type, p->dir, &p->sel, ctx, 0, &err); security_xfrm_policy_free(ctx); } @@ -2514,6 +2543,7 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_OFFLOAD_DEV] = { .len = sizeof(struct xfrm_user_offload) }, [XFRMA_SET_MARK] = { .type = NLA_U32 }, [XFRMA_SET_MARK_MASK] = { .type = NLA_U32 }, + [XFRMA_IF_ID] = { .type = NLA_U32 }, }; static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = { @@ -2645,6 +2675,10 @@ static int build_expire(struct sk_buff *skb, struct xfrm_state *x, const struct if (err) return err; + err = xfrm_if_id_put(skb, x->if_id); + if (err) + return err; + nlmsg_end(skb, nlh); return 0; } @@ -2742,6 +2776,8 @@ static inline size_t xfrm_sa_len(struct xfrm_state *x) l += nla_total_size(sizeof(x->props.smark.v)); l += nla_total_size(sizeof(x->props.smark.m)); } + if (x->if_id) + l += nla_total_size(sizeof(x->if_id)); /* Must count x->lastused as it may become non-zero behind our back. */ l += nla_total_size_64bit(sizeof(u64)); @@ -2870,6 +2906,8 @@ static int build_acquire(struct sk_buff *skb, struct xfrm_state *x, err = copy_to_user_policy_type(xp->type, skb); if (!err) err = xfrm_mark_put(skb, &xp->mark); + if (!err) + err = xfrm_if_id_put(skb, xp->if_id); if (err) { nlmsg_cancel(skb, nlh); return err; @@ -2985,6 +3023,8 @@ static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp, err = copy_to_user_policy_type(xp->type, skb); if (!err) err = xfrm_mark_put(skb, &xp->mark); + if (!err) + err = xfrm_if_id_put(skb, xp->if_id); if (err) { nlmsg_cancel(skb, nlh); return err; @@ -3064,6 +3104,8 @@ static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, const struct km_e err = copy_to_user_policy_type(xp->type, skb); if (!err) err = xfrm_mark_put(skb, &xp->mark); + if (!err) + err = xfrm_if_id_put(skb, xp->if_id); if (err) goto out_free_skb; From 73d9837ccf12f353d7c5bdcf84ebbe6d0c3b2015 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Tue, 12 Jun 2018 14:07:12 +0200 Subject: [PATCH 25/37] UPSTREAM: xfrm: Add virtual xfrm interfaces This patch adds support for virtual xfrm interfaces. Packets that are routed through such an interface are guaranteed to be IPsec transformed or dropped. It is a generic virtual interface that ensures IPsec transformation, no need to know what happens behind the interface. This means that we can tunnel IPv4 and IPv6 through the same interface and support all xfrm modes (tunnel, transport and beet) on it. Co-developed-by: Lorenzo Colitti Co-developed-by: Benedict Wong Signed-off-by: Lorenzo Colitti Signed-off-by: Benedict Wong Signed-off-by: Steffen Klassert Acked-by: Shannon Nelson Tested-by: Benedict Wong Tested-by: Antony Antony Reviewed-by: Eyal Birger (cherry picked from commit f203b76d78092faf248db3f851840fbecf80b40e) Bug: 113046120 Change-Id: I05e8fe1e8a8a4b01886504ce694ddda29e4fbec6 --- include/net/xfrm.h | 24 + include/uapi/linux/if_link.h | 10 + net/xfrm/Kconfig | 8 + net/xfrm/Makefile | 1 + net/xfrm/xfrm_input.c | 3 + net/xfrm/xfrm_interface.c | 972 +++++++++++++++++++++++++++++++++++ net/xfrm/xfrm_policy.c | 43 ++ 7 files changed, 1061 insertions(+) create mode 100644 net/xfrm/xfrm_interface.c diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 687750094fcd..2ee065ec429c 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -293,6 +294,13 @@ struct xfrm_replay { int (*overflow)(struct xfrm_state *x, struct sk_buff *skb); }; +struct xfrm_if_cb { + struct xfrm_if *(*decode_session)(struct sk_buff *skb); +}; + +void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb); +void xfrm_if_unregister_cb(void); + struct net_device; struct xfrm_type; struct xfrm_dst; @@ -1008,6 +1016,22 @@ static inline void xfrm_dst_destroy(struct xfrm_dst *xdst) void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev); +struct xfrm_if_parms { + char name[IFNAMSIZ]; /* name of XFRM device */ + int link; /* ifindex of underlying L2 interface */ + u32 if_id; /* interface identifyer */ +}; + +struct xfrm_if { + struct xfrm_if __rcu *next; /* next interface in list */ + struct net_device *dev; /* virtual device associated with interface */ + struct net_device *phydev; /* physical device */ + struct net *net; /* netns for packet i/o */ + struct xfrm_if_parms p; /* interface parms */ + + struct gro_cells gro_cells; +}; + struct xfrm_offload { /* Output sequence number for replay protection on offloading. */ struct { diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 1f00f0cd6790..b9a7c97f3cc2 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -451,6 +451,16 @@ enum { #define IFLA_MACSEC_MAX (__IFLA_MACSEC_MAX - 1) +/* XFRM section */ +enum { + IFLA_XFRM_UNSPEC, + IFLA_XFRM_LINK, + IFLA_XFRM_IF_ID, + __IFLA_XFRM_MAX +}; + +#define IFLA_XFRM_MAX (__IFLA_XFRM_MAX - 1) + enum macsec_validation_type { MACSEC_VALIDATE_DISABLED = 0, MACSEC_VALIDATE_CHECK = 1, diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index 286ed25c1a69..53381888a7b3 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -25,6 +25,14 @@ config XFRM_USER If unsure, say Y. +config XFRM_INTERFACE + tristate "Transformation virtual interface" + depends on XFRM && IPV6 + ---help--- + This provides a virtual interface to route IPsec traffic. + + If unsure, say N. + config XFRM_SUB_POLICY bool "Transformation sub policy support" depends on XFRM diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile index 0bd2465a8c5a..fbc4552d17b8 100644 --- a/net/xfrm/Makefile +++ b/net/xfrm/Makefile @@ -10,3 +10,4 @@ obj-$(CONFIG_XFRM_STATISTICS) += xfrm_proc.o obj-$(CONFIG_XFRM_ALGO) += xfrm_algo.o obj-$(CONFIG_XFRM_USER) += xfrm_user.o obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o +obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index dc8beae79087..a6f6510dd121 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -320,6 +320,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) seq = 0; if (!spi && (err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) != 0) { + secpath_reset(skb); XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); goto drop; } @@ -328,12 +329,14 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) XFRM_SPI_SKB_CB(skb)->daddroff); do { if (skb->sp->len == XFRM_MAX_DEPTH) { + secpath_reset(skb); XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); goto drop; } x = xfrm_state_lookup(net, mark, daddr, spi, nexthdr, family); if (x == NULL) { + secpath_reset(skb); XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES); xfrm_audit_state_notfound(skb, family, spi, seq); goto drop; diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c new file mode 100644 index 000000000000..31cb1c7e3881 --- /dev/null +++ b/net/xfrm/xfrm_interface.c @@ -0,0 +1,972 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * XFRM virtual interface + * + * Copyright (C) 2018 secunet Security Networks AG + * + * Author: + * Steffen Klassert + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int xfrmi_dev_init(struct net_device *dev); +static void xfrmi_dev_setup(struct net_device *dev); +static struct rtnl_link_ops xfrmi_link_ops __read_mostly; +static unsigned int xfrmi_net_id __read_mostly; + +struct xfrmi_net { + /* lists for storing interfaces in use */ + struct xfrm_if __rcu *xfrmi[1]; +}; + +#define for_each_xfrmi_rcu(start, xi) \ + for (xi = rcu_dereference(start); xi; xi = rcu_dereference(xi->next)) + +static struct xfrm_if *xfrmi_lookup(struct net *net, struct xfrm_state *x) +{ + struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); + struct xfrm_if *xi; + + for_each_xfrmi_rcu(xfrmn->xfrmi[0], xi) { + if (x->if_id == xi->p.if_id && + (xi->dev->flags & IFF_UP)) + return xi; + } + + return NULL; +} + +static struct xfrm_if *xfrmi_decode_session(struct sk_buff *skb) +{ + struct xfrmi_net *xfrmn; + int ifindex; + struct xfrm_if *xi; + + if (!skb->dev) + return NULL; + + xfrmn = net_generic(dev_net(skb->dev), xfrmi_net_id); + ifindex = skb->dev->ifindex; + + for_each_xfrmi_rcu(xfrmn->xfrmi[0], xi) { + if (ifindex == xi->dev->ifindex && + (xi->dev->flags & IFF_UP)) + return xi; + } + + return NULL; +} + +static void xfrmi_link(struct xfrmi_net *xfrmn, struct xfrm_if *xi) +{ + struct xfrm_if __rcu **xip = &xfrmn->xfrmi[0]; + + rcu_assign_pointer(xi->next , rtnl_dereference(*xip)); + rcu_assign_pointer(*xip, xi); +} + +static void xfrmi_unlink(struct xfrmi_net *xfrmn, struct xfrm_if *xi) +{ + struct xfrm_if __rcu **xip; + struct xfrm_if *iter; + + for (xip = &xfrmn->xfrmi[0]; + (iter = rtnl_dereference(*xip)) != NULL; + xip = &iter->next) { + if (xi == iter) { + rcu_assign_pointer(*xip, xi->next); + break; + } + } +} + +static void xfrmi_dev_free(struct net_device *dev) +{ + free_percpu(dev->tstats); +} + +static int xfrmi_create2(struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + struct net *net = dev_net(dev); + struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); + int err; + + dev->rtnl_link_ops = &xfrmi_link_ops; + err = register_netdevice(dev); + if (err < 0) + goto out; + + strcpy(xi->p.name, dev->name); + + dev_hold(dev); + xfrmi_link(xfrmn, xi); + + return 0; + +out: + return err; +} + +static struct xfrm_if *xfrmi_create(struct net *net, struct xfrm_if_parms *p) +{ + struct net_device *dev; + struct xfrm_if *xi; + char name[IFNAMSIZ]; + int err; + + if (p->name[0]) + strlcpy(name, p->name, IFNAMSIZ); + else + goto failed; + + dev = alloc_netdev(sizeof(*xi), name, NET_NAME_UNKNOWN, xfrmi_dev_setup); + if (!dev) + goto failed; + + dev_net_set(dev, net); + + xi = netdev_priv(dev); + xi->p = *p; + xi->net = net; + xi->dev = dev; + xi->phydev = dev_get_by_index(net, p->link); + if (!xi->phydev) + goto failed_free; + + err = xfrmi_create2(dev); + if (err < 0) + goto failed_dev_put; + + return xi; + +failed_dev_put: + dev_put(xi->phydev); +failed_free: + free_netdev(dev); +failed: + return NULL; +} + +static struct xfrm_if *xfrmi_locate(struct net *net, struct xfrm_if_parms *p, + int create) +{ + struct xfrm_if __rcu **xip; + struct xfrm_if *xi; + struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); + + for (xip = &xfrmn->xfrmi[0]; + (xi = rtnl_dereference(*xip)) != NULL; + xip = &xi->next) { + if (xi->p.if_id == p->if_id) { + if (create) + return NULL; + + return xi; + } + } + if (!create) + return NULL; + return xfrmi_create(net, p); +} + +static void xfrmi_dev_uninit(struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + struct xfrmi_net *xfrmn = net_generic(xi->net, xfrmi_net_id); + + xfrmi_unlink(xfrmn, xi); + dev_put(xi->phydev); + dev_put(dev); +} + +static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) +{ + skb->tstamp = 0; + skb->pkt_type = PACKET_HOST; + skb->skb_iif = 0; + skb->ignore_df = 0; + skb_dst_drop(skb); + nf_reset(skb); + nf_reset_trace(skb); + + if (!xnet) + return; + + ipvs_reset(skb); + secpath_reset(skb); + skb_orphan(skb); + skb->mark = 0; +} + +static int xfrmi_rcv_cb(struct sk_buff *skb, int err) +{ + struct pcpu_sw_netstats *tstats; + struct xfrm_mode *inner_mode; + struct net_device *dev; + struct xfrm_state *x; + struct xfrm_if *xi; + bool xnet; + + if (err && !skb->sp) + return 0; + + x = xfrm_input_state(skb); + + xi = xfrmi_lookup(xs_net(x), x); + if (!xi) + return 1; + + dev = xi->dev; + skb->dev = dev; + + if (err) { + dev->stats.rx_errors++; + dev->stats.rx_dropped++; + + return 0; + } + + xnet = !net_eq(xi->net, dev_net(skb->dev)); + + if (xnet) { + inner_mode = x->inner_mode; + + if (x->sel.family == AF_UNSPEC) { + inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); + if (inner_mode == NULL) { + XFRM_INC_STATS(dev_net(skb->dev), + LINUX_MIB_XFRMINSTATEMODEERROR); + return -EINVAL; + } + } + + if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, + inner_mode->afinfo->family)) + return -EPERM; + } + + xfrmi_scrub_packet(skb, xnet); + + tstats = this_cpu_ptr(dev->tstats); + + u64_stats_update_begin(&tstats->syncp); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + u64_stats_update_end(&tstats->syncp); + + return 0; +} + +static int +xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) +{ + struct xfrm_if *xi = netdev_priv(dev); + struct net_device_stats *stats = &xi->dev->stats; + struct dst_entry *dst = skb_dst(skb); + unsigned int length = skb->len; + struct net_device *tdev; + struct xfrm_state *x; + int err = -1; + int mtu; + + if (!dst) + goto tx_err_link_failure; + + fl->flowi_xfrm.if_id = xi->p.if_id; + + dst_hold(dst); + dst = xfrm_lookup(xi->net, dst, fl, NULL, 0); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + dst = NULL; + goto tx_err_link_failure; + } + + x = dst->xfrm; + if (!x) + goto tx_err_link_failure; + + if (x->if_id != xi->p.if_id) + goto tx_err_link_failure; + + tdev = dst->dev; + + if (tdev == dev) { + stats->collisions++; + net_warn_ratelimited("%s: Local routing loop detected!\n", + xi->p.name); + goto tx_err_dst_release; + } + + mtu = dst_mtu(dst); + if (!skb->ignore_df && skb->len > mtu) { + skb_dst_update_pmtu(skb, mtu); + + if (skb->protocol == htons(ETH_P_IPV6)) { + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + } else { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); + } + + dst_release(dst); + return -EMSGSIZE; + } + + xfrmi_scrub_packet(skb, !net_eq(xi->net, dev_net(dev))); + skb_dst_set(skb, dst); + skb->dev = tdev; + + err = dst_output(xi->net, skb->sk, skb); + if (net_xmit_eval(err) == 0) { + struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); + + u64_stats_update_begin(&tstats->syncp); + tstats->tx_bytes += length; + tstats->tx_packets++; + u64_stats_update_end(&tstats->syncp); + } else { + stats->tx_errors++; + stats->tx_aborted_errors++; + } + + return 0; +tx_err_link_failure: + stats->tx_carrier_errors++; + dst_link_failure(skb); +tx_err_dst_release: + dst_release(dst); + return err; +} + +static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + struct net_device_stats *stats = &xi->dev->stats; + struct flowi fl; + int ret; + + memset(&fl, 0, sizeof(fl)); + + switch (skb->protocol) { + case htons(ETH_P_IPV6): + xfrm_decode_session(skb, &fl, AF_INET6); + memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); + break; + case htons(ETH_P_IP): + xfrm_decode_session(skb, &fl, AF_INET); + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + break; + default: + goto tx_err; + } + + fl.flowi_oif = xi->phydev->ifindex; + + ret = xfrmi_xmit2(skb, dev, &fl); + if (ret < 0) + goto tx_err; + + return NETDEV_TX_OK; + +tx_err: + stats->tx_errors++; + stats->tx_dropped++; + kfree_skb(skb); + return NETDEV_TX_OK; +} + +static int xfrmi4_err(struct sk_buff *skb, u32 info) +{ + const struct iphdr *iph = (const struct iphdr *)skb->data; + struct net *net = dev_net(skb->dev); + int protocol = iph->protocol; + struct ip_comp_hdr *ipch; + struct ip_esp_hdr *esph; + struct ip_auth_hdr *ah ; + struct xfrm_state *x; + struct xfrm_if *xi; + __be32 spi; + + switch (protocol) { + case IPPROTO_ESP: + esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); + spi = esph->spi; + break; + case IPPROTO_AH: + ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); + spi = ah->spi; + break; + case IPPROTO_COMP: + ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); + spi = htonl(ntohs(ipch->cpi)); + break; + default: + return 0; + } + + switch (icmp_hdr(skb)->type) { + case ICMP_DEST_UNREACH: + if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) + return 0; + case ICMP_REDIRECT: + break; + default: + return 0; + } + + x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, + spi, protocol, AF_INET); + if (!x) + return 0; + + xi = xfrmi_lookup(net, x); + if (!xi) { + xfrm_state_put(x); + return -1; + } + + if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) + ipv4_update_pmtu(skb, net, info, 0, 0, protocol, 0); + else + ipv4_redirect(skb, net, 0, 0, protocol, 0); + xfrm_state_put(x); + + return 0; +} + +static int xfrmi6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; + struct net *net = dev_net(skb->dev); + int protocol = iph->nexthdr; + struct ip_comp_hdr *ipch; + struct ip_esp_hdr *esph; + struct ip_auth_hdr *ah; + struct xfrm_state *x; + struct xfrm_if *xi; + __be32 spi; + + switch (protocol) { + case IPPROTO_ESP: + esph = (struct ip_esp_hdr *)(skb->data + offset); + spi = esph->spi; + break; + case IPPROTO_AH: + ah = (struct ip_auth_hdr *)(skb->data + offset); + spi = ah->spi; + break; + case IPPROTO_COMP: + ipch = (struct ip_comp_hdr *)(skb->data + offset); + spi = htonl(ntohs(ipch->cpi)); + break; + default: + return 0; + } + + if (type != ICMPV6_PKT_TOOBIG && + type != NDISC_REDIRECT) + return 0; + + x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, + spi, protocol, AF_INET6); + if (!x) + return 0; + + xi = xfrmi_lookup(net, x); + if (!xi) { + xfrm_state_put(x); + return -1; + } + + if (type == NDISC_REDIRECT) + ip6_redirect(skb, net, skb->dev->ifindex, 0, + sock_net_uid(net, NULL)); + else + ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); + xfrm_state_put(x); + + return 0; +} + +static int xfrmi_change(struct xfrm_if *xi, const struct xfrm_if_parms *p) +{ + if (xi->p.link != p->link) + return -EINVAL; + + xi->p.if_id = p->if_id; + + return 0; +} + +static int xfrmi_update(struct xfrm_if *xi, struct xfrm_if_parms *p) +{ + struct net *net = dev_net(xi->dev); + struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); + int err; + + xfrmi_unlink(xfrmn, xi); + synchronize_net(); + err = xfrmi_change(xi, p); + xfrmi_link(xfrmn, xi); + netdev_state_change(xi->dev); + return err; +} + +static void xfrmi_get_stats64(struct net_device *dev, + struct rtnl_link_stats64 *s) +{ + int cpu; + + if (!dev->tstats) + return; + + for_each_possible_cpu(cpu) { + struct pcpu_sw_netstats *stats; + struct pcpu_sw_netstats tmp; + int start; + + stats = per_cpu_ptr(dev->tstats, cpu); + do { + start = u64_stats_fetch_begin_irq(&stats->syncp); + tmp.rx_packets = stats->rx_packets; + tmp.rx_bytes = stats->rx_bytes; + tmp.tx_packets = stats->tx_packets; + tmp.tx_bytes = stats->tx_bytes; + } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); + + s->rx_packets += tmp.rx_packets; + s->rx_bytes += tmp.rx_bytes; + s->tx_packets += tmp.tx_packets; + s->tx_bytes += tmp.tx_bytes; + } + + s->rx_dropped = dev->stats.rx_dropped; + s->tx_dropped = dev->stats.tx_dropped; +} + +static int xfrmi_get_iflink(const struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + + return xi->phydev->ifindex; +} + + +static const struct net_device_ops xfrmi_netdev_ops = { + .ndo_init = xfrmi_dev_init, + .ndo_uninit = xfrmi_dev_uninit, + .ndo_start_xmit = xfrmi_xmit, + .ndo_get_stats64 = xfrmi_get_stats64, + .ndo_get_iflink = xfrmi_get_iflink, +}; + +static void xfrmi_dev_setup(struct net_device *dev) +{ + dev->netdev_ops = &xfrmi_netdev_ops; + dev->type = ARPHRD_NONE; + dev->hard_header_len = ETH_HLEN; + dev->min_header_len = ETH_HLEN; + dev->mtu = ETH_DATA_LEN; + dev->min_mtu = ETH_MIN_MTU; + dev->max_mtu = ETH_DATA_LEN; + dev->addr_len = ETH_ALEN; + dev->flags = IFF_NOARP; + dev->needs_free_netdev = true; + dev->priv_destructor = xfrmi_dev_free; + netif_keep_dst(dev); +} + +static int xfrmi_dev_init(struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + struct net_device *phydev = xi->phydev; + int err; + + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; + + err = gro_cells_init(&xi->gro_cells, dev); + if (err) { + free_percpu(dev->tstats); + return err; + } + + dev->features |= NETIF_F_LLTX; + + dev->needed_headroom = phydev->needed_headroom; + dev->needed_tailroom = phydev->needed_tailroom; + + if (is_zero_ether_addr(dev->dev_addr)) + eth_hw_addr_inherit(dev, phydev); + if (is_zero_ether_addr(dev->broadcast)) + memcpy(dev->broadcast, phydev->broadcast, dev->addr_len); + + return 0; +} + +static int xfrmi_validate(struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + return 0; +} + +static void xfrmi_netlink_parms(struct nlattr *data[], + struct xfrm_if_parms *parms) +{ + memset(parms, 0, sizeof(*parms)); + + if (!data) + return; + + if (data[IFLA_XFRM_LINK]) + parms->link = nla_get_u32(data[IFLA_XFRM_LINK]); + + if (data[IFLA_XFRM_IF_ID]) + parms->if_id = nla_get_u32(data[IFLA_XFRM_IF_ID]); +} + +static int xfrmi_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct net *net = dev_net(dev); + struct xfrm_if_parms *p; + struct xfrm_if *xi; + + xi = netdev_priv(dev); + p = &xi->p; + + xfrmi_netlink_parms(data, p); + + if (!tb[IFLA_IFNAME]) + return -EINVAL; + + nla_strlcpy(p->name, tb[IFLA_IFNAME], IFNAMSIZ); + + if (!xfrmi_locate(net, p, 1)) + return -EEXIST; + + return 0; +} + +static void xfrmi_dellink(struct net_device *dev, struct list_head *head) +{ + unregister_netdevice_queue(dev, head); +} + +static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct xfrm_if *xi = netdev_priv(dev); + struct net *net = dev_net(dev); + + xfrmi_netlink_parms(data, &xi->p); + + xi = xfrmi_locate(net, &xi->p, 0); + + if (xi) { + if (xi->dev != dev) + return -EEXIST; + } else + xi = netdev_priv(dev); + + return xfrmi_update(xi, &xi->p); +} + +static size_t xfrmi_get_size(const struct net_device *dev) +{ + return + /* IFLA_XFRM_LINK */ + nla_total_size(4) + + /* IFLA_XFRM_IF_ID */ + nla_total_size(4) + + 0; +} + +static int xfrmi_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + struct xfrm_if_parms *parm = &xi->p; + + if (nla_put_u32(skb, IFLA_XFRM_LINK, parm->link) || + nla_put_u32(skb, IFLA_XFRM_IF_ID, parm->if_id)) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +struct net *xfrmi_get_link_net(const struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + + return dev_net(xi->phydev); +} + +static const struct nla_policy xfrmi_policy[IFLA_XFRM_MAX + 1] = { + [IFLA_XFRM_LINK] = { .type = NLA_U32 }, + [IFLA_XFRM_IF_ID] = { .type = NLA_U32 }, +}; + +static struct rtnl_link_ops xfrmi_link_ops __read_mostly = { + .kind = "xfrm", + .maxtype = IFLA_XFRM_MAX, + .policy = xfrmi_policy, + .priv_size = sizeof(struct xfrm_if), + .setup = xfrmi_dev_setup, + .validate = xfrmi_validate, + .newlink = xfrmi_newlink, + .dellink = xfrmi_dellink, + .changelink = xfrmi_changelink, + .get_size = xfrmi_get_size, + .fill_info = xfrmi_fill_info, + .get_link_net = xfrmi_get_link_net, +}; + +static void __net_exit xfrmi_destroy_interfaces(struct xfrmi_net *xfrmn) +{ + struct xfrm_if *xi; + LIST_HEAD(list); + + xi = rtnl_dereference(xfrmn->xfrmi[0]); + if (!xi) + return; + + unregister_netdevice_queue(xi->dev, &list); + unregister_netdevice_many(&list); +} + +static int __net_init xfrmi_init_net(struct net *net) +{ + return 0; +} + +static void __net_exit xfrmi_exit_net(struct net *net) +{ + struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); + + rtnl_lock(); + xfrmi_destroy_interfaces(xfrmn); + rtnl_unlock(); +} + +static struct pernet_operations xfrmi_net_ops = { + .init = xfrmi_init_net, + .exit = xfrmi_exit_net, + .id = &xfrmi_net_id, + .size = sizeof(struct xfrmi_net), +}; + +static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = { + .handler = xfrm6_rcv, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi6_err, + .priority = 10, +}; + +static struct xfrm6_protocol xfrmi_ah6_protocol __read_mostly = { + .handler = xfrm6_rcv, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi6_err, + .priority = 10, +}; + +static struct xfrm6_protocol xfrmi_ipcomp6_protocol __read_mostly = { + .handler = xfrm6_rcv, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi6_err, + .priority = 10, +}; + +static struct xfrm4_protocol xfrmi_esp4_protocol __read_mostly = { + .handler = xfrm4_rcv, + .input_handler = xfrm_input, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi4_err, + .priority = 10, +}; + +static struct xfrm4_protocol xfrmi_ah4_protocol __read_mostly = { + .handler = xfrm4_rcv, + .input_handler = xfrm_input, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi4_err, + .priority = 10, +}; + +static struct xfrm4_protocol xfrmi_ipcomp4_protocol __read_mostly = { + .handler = xfrm4_rcv, + .input_handler = xfrm_input, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi4_err, + .priority = 10, +}; + +static int __init xfrmi4_init(void) +{ + int err; + + err = xfrm4_protocol_register(&xfrmi_esp4_protocol, IPPROTO_ESP); + if (err < 0) + goto xfrm_proto_esp_failed; + err = xfrm4_protocol_register(&xfrmi_ah4_protocol, IPPROTO_AH); + if (err < 0) + goto xfrm_proto_ah_failed; + err = xfrm4_protocol_register(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); + if (err < 0) + goto xfrm_proto_comp_failed; + + return 0; + +xfrm_proto_comp_failed: + xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH); +xfrm_proto_ah_failed: + xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP); +xfrm_proto_esp_failed: + return err; +} + +static void xfrmi4_fini(void) +{ + xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); + xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH); + xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP); +} + +static int __init xfrmi6_init(void) +{ + int err; + + err = xfrm6_protocol_register(&xfrmi_esp6_protocol, IPPROTO_ESP); + if (err < 0) + goto xfrm_proto_esp_failed; + err = xfrm6_protocol_register(&xfrmi_ah6_protocol, IPPROTO_AH); + if (err < 0) + goto xfrm_proto_ah_failed; + err = xfrm6_protocol_register(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); + if (err < 0) + goto xfrm_proto_comp_failed; + + return 0; + +xfrm_proto_comp_failed: + xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH); +xfrm_proto_ah_failed: + xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP); +xfrm_proto_esp_failed: + return err; +} + +static void xfrmi6_fini(void) +{ + xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); + xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH); + xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP); +} + +static const struct xfrm_if_cb xfrm_if_cb = { + .decode_session = xfrmi_decode_session, +}; + +static int __init xfrmi_init(void) +{ + const char *msg; + int err; + + pr_info("IPsec XFRM device driver\n"); + + msg = "tunnel device"; + err = register_pernet_device(&xfrmi_net_ops); + if (err < 0) + goto pernet_dev_failed; + + msg = "xfrm4 protocols"; + err = xfrmi4_init(); + if (err < 0) + goto xfrmi4_failed; + + msg = "xfrm6 protocols"; + err = xfrmi6_init(); + if (err < 0) + goto xfrmi6_failed; + + + msg = "netlink interface"; + err = rtnl_link_register(&xfrmi_link_ops); + if (err < 0) + goto rtnl_link_failed; + + xfrm_if_register_cb(&xfrm_if_cb); + + return err; + +rtnl_link_failed: + xfrmi6_fini(); +xfrmi6_failed: + xfrmi4_fini(); +xfrmi4_failed: + unregister_pernet_device(&xfrmi_net_ops); +pernet_dev_failed: + pr_err("xfrmi init: failed to register %s\n", msg); + return err; +} + +static void __exit xfrmi_fini(void) +{ + xfrm_if_unregister_cb(); + rtnl_link_unregister(&xfrmi_link_ops); + xfrmi4_fini(); + xfrmi6_fini(); + unregister_pernet_device(&xfrmi_net_ops); +} + +module_init(xfrmi_init); +module_exit(xfrmi_fini); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_RTNL_LINK("xfrm"); +MODULE_ALIAS_NETDEV("xfrm0"); +MODULE_AUTHOR("Steffen Klassert"); +MODULE_DESCRIPTION("XFRM virtual interface"); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index a3cd67ca24eb..17a6c5b6c5a6 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -47,6 +47,9 @@ struct xfrm_flo { static DEFINE_PER_CPU(struct xfrm_dst *, xfrm_last_dst); static struct work_struct *xfrm_pcpu_work __read_mostly; +static DEFINE_SPINLOCK(xfrm_if_cb_lock); +static struct xfrm_if_cb const __rcu *xfrm_if_cb __read_mostly; + static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock); static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1] __read_mostly; @@ -119,6 +122,12 @@ static const struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short fa return afinfo; } +/* Called with rcu_read_lock(). */ +static const struct xfrm_if_cb *xfrm_if_get_cb(void) +{ + return rcu_dereference(xfrm_if_cb); +} + struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, const xfrm_address_t *saddr, const xfrm_address_t *daddr, @@ -2082,6 +2091,11 @@ xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, if (IS_ERR(xdst)) { err = PTR_ERR(xdst); + if (err == -EREMOTE) { + xfrm_pols_put(pols, num_pols); + return NULL; + } + if (err != -EAGAIN) goto error; goto make_dummy_bundle; @@ -2175,6 +2189,9 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, if (IS_ERR(xdst)) { xfrm_pols_put(pols, num_pols); err = PTR_ERR(xdst); + if (err == -EREMOTE) + goto nopol; + goto dropdst; } else if (xdst == NULL) { num_xfrms = 0; @@ -2367,12 +2384,20 @@ int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned int family, int reverse) { const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); + const struct xfrm_if_cb *ifcb = xfrm_if_get_cb(); + struct xfrm_if *xi; int err; if (unlikely(afinfo == NULL)) return -EAFNOSUPPORT; afinfo->decode_session(skb, fl, reverse); + if (ifcb) { + xi = ifcb->decode_session(skb); + if (xi) + fl->flowi_xfrm.if_id = xi->p.if_id; + } + err = security_xfrm_decode_session(skb, &fl->flowi_secid); rcu_read_unlock(); return err; @@ -2822,6 +2847,21 @@ void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo) } EXPORT_SYMBOL(xfrm_policy_unregister_afinfo); +void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb) +{ + spin_lock(&xfrm_if_cb_lock); + rcu_assign_pointer(xfrm_if_cb, ifcb); + spin_unlock(&xfrm_if_cb_lock); +} +EXPORT_SYMBOL(xfrm_if_register_cb); + +void xfrm_if_unregister_cb(void) +{ + RCU_INIT_POINTER(xfrm_if_cb, NULL); + synchronize_rcu(); +} +EXPORT_SYMBOL(xfrm_if_unregister_cb); + #ifdef CONFIG_XFRM_STATISTICS static int __net_init xfrm_statistics_init(struct net *net) { @@ -3003,6 +3043,9 @@ void __init xfrm_init(void) register_pernet_subsys(&xfrm_net_ops); seqcount_init(&xfrm_policy_hash_generation); xfrm_input_init(); + + RCU_INIT_POINTER(xfrm_if_cb, NULL); + synchronize_rcu(); } #ifdef CONFIG_AUDITSYSCALL From 79dd401f8e637f137975ca55a12f21e09ccc4d81 Mon Sep 17 00:00:00 2001 From: Nathan Harold Date: Fri, 29 Jun 2018 15:07:10 -0700 Subject: [PATCH 26/37] UPSTREAM: xfrm: Allow Set Mark to be Updated Using UPDSA Allow UPDSA to change "set mark" to permit policy separation of packet routing decisions from SA keying in systems that use mark-based routing. The set mark, used as a routing and firewall mark for outbound packets, is made update-able which allows routing decisions to be handled independently of keying/SA creation. To maintain consistency with other optional attributes, the set mark is only updated if sent with a non-zero value. The per-SA lock and the xfrm_state_lock are taken in that order to avoid a deadlock with xfrm_timer_handler(), which also takes the locks in that order. Signed-off-by: Nathan Harold Signed-off-by: Steffen Klassert (cherry picked from commit 6d8e85ffe17895d7bc632dfbaa9e2e33b22fe873) Signed-off-by: Benedict Wong Bug: 113046120 Change-Id: Ie7ab34ca38aedb034cf5aa83457c552c43f4f566 --- net/xfrm/xfrm_state.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index a065ca04e4f1..3d6dab2dd3cd 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1559,6 +1559,15 @@ out: if (x1->curlft.use_time) xfrm_state_check_expire(x1); + if (x->props.smark.m || x->props.smark.v) { + spin_lock_bh(&net->xfrm.xfrm_state_lock); + + x1->props.smark = x->props.smark; + + __xfrm_state_bump_genids(x1); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); + } + err = 0; x->km.state = XFRM_STATE_DEAD; __xfrm_state_put(x); From 0e1ddf08e1f6dd2f88087156fa3dc05b8284239c Mon Sep 17 00:00:00 2001 From: Benedict Wong Date: Thu, 19 Jul 2018 10:50:44 -0700 Subject: [PATCH 27/37] UPSTREAM: xfrm: Remove xfrmi interface ID from flowi In order to remove performance impact of having the extra u32 in every single flowi, this change removes the flowi_xfrm struct, prefering to take the if_id as a method parameter where needed. In the inbound direction, if_id is only needed during the __xfrm_check_policy() function, and the if_id can be determined at that point based on the skb. As such, xfrmi_decode_session() is only called with the skb in __xfrm_check_policy(). In the outbound direction, the only place where if_id is needed is the xfrm_lookup() call in xfrmi_xmit2(). With this change, the if_id is directly passed into the xfrm_lookup_with_ifid() call. All existing callers can still call xfrm_lookup(), which uses a default if_id of 0. This change does not change any behavior of XFRMIs except for improving overall system performance via flowi size reduction. This change has been tested against the Android Kernel Networking Tests: https://android.googlesource.com/kernel/tests/+/master/net/test Signed-off-by: Benedict Wong Signed-off-by: Steffen Klassert (cherry picked from commit bc56b33404599edc412b91933d74b36873e8ea25) Bug: 113046120 Change-Id: Icd3a1ea08427b91c54a64318d9dbb9acfb5d429a --- include/net/dst.h | 14 ++++++ include/net/flow.h | 9 ---- include/net/xfrm.h | 2 +- net/xfrm/xfrm_interface.c | 4 +- net/xfrm/xfrm_policy.c | 98 ++++++++++++++++++++++++++------------- net/xfrm/xfrm_state.c | 3 +- 6 files changed, 83 insertions(+), 47 deletions(-) diff --git a/include/net/dst.h b/include/net/dst.h index ebfb4328fdb1..cf7ce4fdfa40 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -490,6 +490,14 @@ static inline struct dst_entry *xfrm_lookup(struct net *net, return dst_orig; } +static inline struct dst_entry * +xfrm_lookup_with_ifid(struct net *net, struct dst_entry *dst_orig, + const struct flowi *fl, const struct sock *sk, + int flags, u32 if_id) +{ + return dst_orig; +} + static inline struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, @@ -509,6 +517,12 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags); +struct dst_entry *xfrm_lookup_with_ifid(struct net *net, + struct dst_entry *dst_orig, + const struct flowi *fl, + const struct sock *sk, int flags, + u32 if_id); + struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags); diff --git a/include/net/flow.h b/include/net/flow.h index 6c2272213a6b..f1624fd5b1d0 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -26,10 +26,6 @@ struct flowi_tunnel { __be64 tun_id; }; -struct flowi_xfrm { - __u32 if_id; -}; - struct flowi_common { int flowic_oif; int flowic_iif; @@ -43,7 +39,6 @@ struct flowi_common { #define FLOWI_FLAG_SKIP_NH_OIF 0x04 __u32 flowic_secid; struct flowi_tunnel flowic_tun_key; - struct flowi_xfrm xfrm; kuid_t flowic_uid; }; @@ -83,7 +78,6 @@ struct flowi4 { #define flowi4_secid __fl_common.flowic_secid #define flowi4_tun_key __fl_common.flowic_tun_key #define flowi4_uid __fl_common.flowic_uid -#define flowi4_xfrm __fl_common.xfrm /* (saddr,daddr) must be grouped, same order as in IP header */ __be32 saddr; @@ -115,7 +109,6 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, fl4->flowi4_flags = flags; fl4->flowi4_secid = 0; fl4->flowi4_tun_key.tun_id = 0; - fl4->flowi4_xfrm.if_id = 0; fl4->flowi4_uid = uid; fl4->daddr = daddr; fl4->saddr = saddr; @@ -145,7 +138,6 @@ struct flowi6 { #define flowi6_secid __fl_common.flowic_secid #define flowi6_tun_key __fl_common.flowic_tun_key #define flowi6_uid __fl_common.flowic_uid -#define flowi6_xfrm __fl_common.xfrm struct in6_addr daddr; struct in6_addr saddr; /* Note: flowi6_tos is encoded in flowlabel, too. */ @@ -193,7 +185,6 @@ struct flowi { #define flowi_secid u.__fl_common.flowic_secid #define flowi_tun_key u.__fl_common.flowic_tun_key #define flowi_uid u.__fl_common.flowic_uid -#define flowi_xfrm u.__fl_common.xfrm } __attribute__((__aligned__(BITS_PER_LONG/8))); static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 2ee065ec429c..d290de00670e 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1526,7 +1526,7 @@ struct xfrm_state *xfrm_state_find(const xfrm_address_t *daddr, const struct flowi *fl, struct xfrm_tmpl *tmpl, struct xfrm_policy *pol, int *err, - unsigned short family); + unsigned short family, u32 if_id); struct xfrm_state *xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id, xfrm_address_t *daddr, xfrm_address_t *saddr, diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 31cb1c7e3881..ccfe18d67e98 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -307,10 +307,8 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) if (!dst) goto tx_err_link_failure; - fl->flowi_xfrm.if_id = xi->p.if_id; - dst_hold(dst); - dst = xfrm_lookup(xi->net, dst, fl, NULL, 0); + dst = xfrm_lookup_with_ifid(xi->net, dst, fl, NULL, 0, xi->p.if_id); if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 17a6c5b6c5a6..d68d56ec1315 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1071,14 +1071,14 @@ EXPORT_SYMBOL(xfrm_policy_walk_done); */ static int xfrm_policy_match(const struct xfrm_policy *pol, const struct flowi *fl, - u8 type, u16 family, int dir) + u8 type, u16 family, int dir, u32 if_id) { const struct xfrm_selector *sel = &pol->selector; int ret = -ESRCH; bool match; if (pol->family != family || - pol->if_id != fl->flowi_xfrm.if_id || + pol->if_id != if_id || (fl->flowi_mark & pol->mark.m) != pol->mark.v || pol->type != type) return ret; @@ -1093,7 +1093,8 @@ static int xfrm_policy_match(const struct xfrm_policy *pol, static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, const struct flowi *fl, - u16 family, u8 dir) + u16 family, u8 dir, + u32 if_id) { int err; struct xfrm_policy *pol, *ret; @@ -1117,7 +1118,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, priority = ~0U; ret = NULL; hlist_for_each_entry_rcu(pol, chain, bydst) { - err = xfrm_policy_match(pol, fl, type, family, dir); + err = xfrm_policy_match(pol, fl, type, family, dir, if_id); if (err) { if (err == -ESRCH) continue; @@ -1136,7 +1137,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, if ((pol->priority >= priority) && ret) break; - err = xfrm_policy_match(pol, fl, type, family, dir); + err = xfrm_policy_match(pol, fl, type, family, dir, if_id); if (err) { if (err == -ESRCH) continue; @@ -1161,21 +1162,25 @@ fail: return ret; } -static struct xfrm_policy * -xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir) +static struct xfrm_policy *xfrm_policy_lookup(struct net *net, + const struct flowi *fl, + u16 family, u8 dir, u32 if_id) { #ifdef CONFIG_XFRM_SUB_POLICY struct xfrm_policy *pol; - pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir); + pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, + dir, if_id); if (pol != NULL) return pol; #endif - return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir); + return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, + dir, if_id); } static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, - const struct flowi *fl, u16 family) + const struct flowi *fl, + u16 family, u32 if_id) { struct xfrm_policy *pol; @@ -1194,7 +1199,7 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, match = xfrm_selector_match(&pol->selector, fl, family); if (match) { if ((sk->sk_mark & pol->mark.m) != pol->mark.v || - pol->if_id != fl->flowi_xfrm.if_id) { + pol->if_id != if_id) { pol = NULL; goto out; } @@ -1408,7 +1413,8 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl, } } - x = xfrm_state_find(remote, local, fl, tmpl, policy, &error, family); + x = xfrm_state_find(remote, local, fl, tmpl, policy, &error, + family, policy->if_id); if (x && x->km.state == XFRM_STATE_VALID) { xfrm[nx++] = x; @@ -1711,7 +1717,8 @@ static int xfrm_expand_policies(const struct flowi *fl, u16 family, pols[1] = xfrm_policy_lookup_bytype(xp_net(pols[0]), XFRM_POLICY_TYPE_MAIN, fl, family, - XFRM_POLICY_OUT); + XFRM_POLICY_OUT, + pols[0]->if_id); if (pols[1]) { if (IS_ERR(pols[1])) { xfrm_pols_put(pols, *num_pols); @@ -2064,8 +2071,10 @@ free_dst: goto out; } -static struct xfrm_dst * -xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, struct xfrm_flo *xflo) +static struct xfrm_dst *xfrm_bundle_lookup(struct net *net, + const struct flowi *fl, + u16 family, u8 dir, + struct xfrm_flo *xflo, u32 if_id) { struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; int num_pols = 0, num_xfrms = 0, err; @@ -2074,7 +2083,7 @@ xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, /* Resolve policies to use if we couldn't get them from * previous cache entry */ num_pols = 1; - pols[0] = xfrm_policy_lookup(net, fl, family, dir); + pols[0] = xfrm_policy_lookup(net, fl, family, dir, if_id); err = xfrm_expand_policies(fl, family, pols, &num_pols, &num_xfrms); if (err < 0) @@ -2145,14 +2154,19 @@ static struct dst_entry *make_blackhole(struct net *net, u16 family, return ret; } -/* Main function: finds/creates a bundle for given flow. +/* Finds/creates a bundle for given flow and if_id * * At the moment we eat a raw IP route. Mostly to speed up lookups * on interfaces with disabled IPsec. + * + * xfrm_lookup uses an if_id of 0 by default, and is provided for + * compatibility */ -struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, - const struct flowi *fl, - const struct sock *sk, int flags) +struct dst_entry *xfrm_lookup_with_ifid(struct net *net, + struct dst_entry *dst_orig, + const struct flowi *fl, + const struct sock *sk, + int flags, u32 if_id) { struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; struct xfrm_dst *xdst; @@ -2168,7 +2182,8 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, sk = sk_const_to_full_sk(sk); if (sk && sk->sk_policy[XFRM_POLICY_OUT]) { num_pols = 1; - pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, family); + pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, family, + if_id); err = xfrm_expand_policies(fl, family, pols, &num_pols, &num_xfrms); if (err < 0) @@ -2214,7 +2229,7 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, !net->xfrm.policy_count[XFRM_POLICY_OUT]) goto nopol; - xdst = xfrm_bundle_lookup(net, fl, family, dir, &xflo); + xdst = xfrm_bundle_lookup(net, fl, family, dir, &xflo, if_id); if (xdst == NULL) goto nopol; if (IS_ERR(xdst)) { @@ -2295,6 +2310,19 @@ dropdst: xfrm_pols_put(pols, drop_pols); return ERR_PTR(err); } +EXPORT_SYMBOL(xfrm_lookup_with_ifid); + +/* Main function: finds/creates a bundle for given flow. + * + * At the moment we eat a raw IP route. Mostly to speed up lookups + * on interfaces with disabled IPsec. + */ +struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, + const struct flowi *fl, const struct sock *sk, + int flags) +{ + return xfrm_lookup_with_ifid(net, dst_orig, fl, sk, flags, 0); +} EXPORT_SYMBOL(xfrm_lookup); /* Callers of xfrm_lookup_route() must ensure a call to dst_output(). @@ -2384,19 +2412,12 @@ int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned int family, int reverse) { const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); - const struct xfrm_if_cb *ifcb = xfrm_if_get_cb(); - struct xfrm_if *xi; int err; if (unlikely(afinfo == NULL)) return -EAFNOSUPPORT; afinfo->decode_session(skb, fl, reverse); - if (ifcb) { - xi = ifcb->decode_session(skb); - if (xi) - fl->flowi_xfrm.if_id = xi->p.if_id; - } err = security_xfrm_decode_session(skb, &fl->flowi_secid); rcu_read_unlock(); @@ -2428,6 +2449,19 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, int reverse; struct flowi fl; int xerr_idx = -1; + const struct xfrm_if_cb *ifcb; + struct xfrm_if *xi; + u32 if_id = 0; + + rcu_read_lock(); + ifcb = xfrm_if_get_cb(); + + if (ifcb) { + xi = ifcb->decode_session(skb); + if (xi) + if_id = xi->p.if_id; + } + rcu_read_unlock(); reverse = dir & ~XFRM_POLICY_MASK; dir &= XFRM_POLICY_MASK; @@ -2455,7 +2489,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, pol = NULL; sk = sk_to_full_sk(sk); if (sk && sk->sk_policy[dir]) { - pol = xfrm_sk_policy_lookup(sk, dir, &fl, family); + pol = xfrm_sk_policy_lookup(sk, dir, &fl, family, if_id); if (IS_ERR(pol)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); return 0; @@ -2463,7 +2497,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, } if (!pol) - pol = xfrm_policy_lookup(net, &fl, family, dir); + pol = xfrm_policy_lookup(net, &fl, family, dir, if_id); if (IS_ERR(pol)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); @@ -2487,7 +2521,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) { pols[1] = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, &fl, family, - XFRM_POLICY_IN); + XFRM_POLICY_IN, if_id); if (pols[1]) { if (IS_ERR(pols[1])) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 3d6dab2dd3cd..11c07c5fec23 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -931,7 +931,7 @@ struct xfrm_state * xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, const struct flowi *fl, struct xfrm_tmpl *tmpl, struct xfrm_policy *pol, int *err, - unsigned short family) + unsigned short family, u32 if_id) { static xfrm_address_t saddr_wildcard = { }; struct net *net = xp_net(pol); @@ -941,7 +941,6 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, int error = 0; struct xfrm_state *best = NULL; u32 mark = pol->mark.v & pol->mark.m; - u32 if_id = fl->flowi_xfrm.if_id; unsigned short encap_family = tmpl->encap_family; unsigned int sequence; struct km_event c; From ac346daa945a49aed03e2dc48b2c5ca8f188c431 Mon Sep 17 00:00:00 2001 From: Nathan Harold Date: Thu, 19 Jul 2018 19:07:47 -0700 Subject: [PATCH 28/37] UPSTREAM: xfrm: Allow xfrmi if_id to be updated by UPDSA Allow attaching an SA to an xfrm interface id after the creation of the SA, so that tasks such as keying which must be done as the SA is created, can remain separate from the decision on how to route traffic from an SA. This permits SA creation to be decomposed in to three separate steps: 1) allocation of a SPI 2) algorithm and key negotiation 3) insertion into the data path Signed-off-by: Nathan Harold Signed-off-by: Steffen Klassert (cherry picked from commit 5baf4f9c0035f3e33bb693a1a1e87599f6e804e6) Signed-off-by: Benedict Wong Bug: 113046120 Change-Id: I45a4bf725f3b8eaa8dae3266f9c411febb4c8720 --- net/xfrm/xfrm_state.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 11c07c5fec23..5e53ad67ece7 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1558,10 +1558,14 @@ out: if (x1->curlft.use_time) xfrm_state_check_expire(x1); - if (x->props.smark.m || x->props.smark.v) { + if (x->props.smark.m || x->props.smark.v || x->if_id) { spin_lock_bh(&net->xfrm.xfrm_state_lock); - x1->props.smark = x->props.smark; + if (x->props.smark.m || x->props.smark.v) + x1->props.smark = x->props.smark; + + if (x->if_id) + x1->if_id = x->if_id; __xfrm_state_bump_genids(x1); spin_unlock_bh(&net->xfrm.xfrm_state_lock); From 4a33ca9e019dd3c73bfefafb692e2c20e59bcfd0 Mon Sep 17 00:00:00 2001 From: Benedict Wong Date: Wed, 25 Jul 2018 13:45:29 -0700 Subject: [PATCH 29/37] UPSTREAM: xfrm: Return detailed errors from xfrmi_newlink Currently all failure modes of xfrm interface creation return EEXIST. This change improves the granularity of errnos provided by also returning ENODEV or EINVAL if failures happen in looking up the underlying interface, or a required parameter is not provided. This change has been tested against the Android Kernel Networking Tests, with additional xfrmi_newlink tests here: https://android-review.googlesource.com/c/kernel/tests/+/715755 Signed-off-by: Benedict Wong Signed-off-by: Steffen Klassert (cherry picked from commit 44e2b838c24d883dae8496dc7b6ddac7956ba53c) Bug: 113046120 Change-Id: Ic680bf1e4a828aaae01b289223d9396a551eefd2 --- net/xfrm/xfrm_interface.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index ccfe18d67e98..481d7307ab51 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -149,14 +149,18 @@ static struct xfrm_if *xfrmi_create(struct net *net, struct xfrm_if_parms *p) char name[IFNAMSIZ]; int err; - if (p->name[0]) + if (p->name[0]) { strlcpy(name, p->name, IFNAMSIZ); - else + } else { + err = -EINVAL; goto failed; + } dev = alloc_netdev(sizeof(*xi), name, NET_NAME_UNKNOWN, xfrmi_dev_setup); - if (!dev) + if (!dev) { + err = -EAGAIN; goto failed; + } dev_net_set(dev, net); @@ -165,8 +169,10 @@ static struct xfrm_if *xfrmi_create(struct net *net, struct xfrm_if_parms *p) xi->net = net; xi->dev = dev; xi->phydev = dev_get_by_index(net, p->link); - if (!xi->phydev) + if (!xi->phydev) { + err = -ENODEV; goto failed_free; + } err = xfrmi_create2(dev); if (err < 0) @@ -179,7 +185,7 @@ failed_dev_put: failed_free: free_netdev(dev); failed: - return NULL; + return ERR_PTR(err); } static struct xfrm_if *xfrmi_locate(struct net *net, struct xfrm_if_parms *p, @@ -194,13 +200,13 @@ static struct xfrm_if *xfrmi_locate(struct net *net, struct xfrm_if_parms *p, xip = &xi->next) { if (xi->p.if_id == p->if_id) { if (create) - return NULL; + return ERR_PTR(-EEXIST); return xi; } } if (!create) - return NULL; + return ERR_PTR(-ENODEV); return xfrmi_create(net, p); } @@ -682,8 +688,9 @@ static int xfrmi_newlink(struct net *src_net, struct net_device *dev, nla_strlcpy(p->name, tb[IFLA_IFNAME], IFNAMSIZ); - if (!xfrmi_locate(net, p, 1)) - return -EEXIST; + xi = xfrmi_locate(net, p, 1); + if (IS_ERR(xi)) + return PTR_ERR(xi); return 0; } @@ -704,11 +711,12 @@ static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], xi = xfrmi_locate(net, &xi->p, 0); - if (xi) { + if (IS_ERR_OR_NULL(xi)) { + xi = netdev_priv(dev); + } else { if (xi->dev != dev) return -EEXIST; - } else - xi = netdev_priv(dev); + } return xfrmi_update(xi, &xi->p); } From 36b4801b9aad3c03b64b5f46c6791ec1730817ca Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Thu, 26 Jul 2018 15:09:52 +0800 Subject: [PATCH 30/37] UPSTREAM: xfrm: fix ptr_ret.cocci warnings net/xfrm/xfrm_interface.c:692:1-3: WARNING: PTR_ERR_OR_ZERO can be used Use PTR_ERR_OR_ZERO rather than if(IS_ERR(...)) + PTR_ERR Generated by: scripts/coccinelle/api/ptr_ret.cocci Fixes: 44e2b838c24d ("xfrm: Return detailed errors from xfrmi_newlink") CC: Benedict Wong Signed-off-by: kbuild test robot Signed-off-by: Steffen Klassert (cherry picked from commit c6f5e017df9dfa9f6cbe70da008e7d716d726f1b) Signed-off-by: Benedict Wong Bug: 113046120 Test: All kernel net-tests run, passing (20x repeated) Change-Id: I4ec93c0427fded57ff5126dc7b3d97d9b5fd615b --- net/xfrm/xfrm_interface.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 481d7307ab51..31acc6f33d98 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -689,10 +689,7 @@ static int xfrmi_newlink(struct net *src_net, struct net_device *dev, nla_strlcpy(p->name, tb[IFLA_IFNAME], IFNAMSIZ); xi = xfrmi_locate(net, p, 1); - if (IS_ERR(xi)) - return PTR_ERR(xi); - - return 0; + return PTR_ERR_OR_ZERO(xi); } static void xfrmi_dellink(struct net_device *dev, struct list_head *head) From b859aa7d7a0c5eb257b03b08da5a71cec7e61e3c Mon Sep 17 00:00:00 2001 From: Sumit Semwal Date: Wed, 5 Sep 2018 20:48:15 +0530 Subject: [PATCH 31/37] ANDROID: squashfs: resolve merge conflict with 4.14.68 Commit a3f94cb99a854fa381fe7fadd97c4f61633717a5 upstream changes the way expected length is calculated for squashfs_readpage. This got merged to stable 4.14.68. Android adds .readpages path to make things faster, but the above commit conflicts with it. We try to adapt as close to upstream as possible, while not touching the squashfs_readpages_block() mechanism. Signed-off-by: Sumit Semwal Signed-off-by: Greg Kroah-Hartman --- fs/squashfs/file.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index 59254a7bf8f0..9236e7d45eb3 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -453,7 +453,8 @@ static int squashfs_readpage_fragment(struct page *page, int expected) } static int squashfs_readpages_fragment(struct page *page, - struct list_head *readahead_pages, struct address_space *mapping) + struct list_head *readahead_pages, struct address_space *mapping, + int expected) { if (!page) { page = lru_to_page(readahead_pages); @@ -464,18 +465,18 @@ static int squashfs_readpages_fragment(struct page *page, return 0; } } - return squashfs_readpage_fragment(page); + return squashfs_readpage_fragment(page, expected); } -static int squashfs_readpage_sparse(struct page *page, int index, int file_end) +static int squashfs_readpage_sparse(struct page *page, int expected) { squashfs_copy_cache(page, NULL, expected, 0); return 0; } static int squashfs_readpages_sparse(struct page *page, - struct list_head *readahead_pages, int index, int file_end, - struct address_space *mapping) + struct list_head *readahead_pages, struct address_space *mapping, + int expected) { if (!page) { page = lru_to_page(readahead_pages); @@ -486,7 +487,7 @@ static int squashfs_readpages_sparse(struct page *page, return 0; } } - return squashfs_readpage_sparse(page, index, file_end); + return squashfs_readpage_sparse(page, expected); } static int __squashfs_readpages(struct file *file, struct page *page, @@ -496,9 +497,6 @@ static int __squashfs_readpages(struct file *file, struct page *page, struct inode *inode = mapping->host; struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; int file_end = i_size_read(inode) >> msblk->block_log; - int expected = index == file_end ? - (i_size_read(inode) & (msblk->block_size - 1)) : - msblk->block_size; int res; do { @@ -506,6 +504,9 @@ static int __squashfs_readpages(struct file *file, struct page *page, : lru_to_page(readahead_pages); int page_index = cur_page->index; int index = page_index >> (msblk->block_log - PAGE_SHIFT); + int expected = index == file_end ? + (i_size_read(inode) & (msblk->block_size - 1)) : + msblk->block_size; if (page_index >= ((i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT)) @@ -521,8 +522,7 @@ static int __squashfs_readpages(struct file *file, struct page *page, if (bsize == 0) { res = squashfs_readpages_sparse(page, - readahead_pages, index, file_end, - mapping); + readahead_pages, mapping, expected); } else { res = squashfs_readpages_block(page, readahead_pages, &nr_pages, mapping, @@ -530,7 +530,7 @@ static int __squashfs_readpages(struct file *file, struct page *page, } } else { res = squashfs_readpages_fragment(page, - readahead_pages, mapping); + readahead_pages, mapping, expected); } if (res) return 0; From 666c420fa3ea13f407550576c08f592d0f7d8202 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Sat, 25 Aug 2018 13:50:56 -0700 Subject: [PATCH 32/37] FROMLIST: ANDROID: binder: Add BINDER_GET_NODE_INFO_FOR_REF ioctl. This allows the context manager to retrieve information about nodes that it holds a reference to, such as the current number of references to those nodes. Such information can for example be used to determine whether the servicemanager is the only process holding a reference to a node. This information can then be passed on to the process holding the node, which can in turn decide whether it wants to shut down to reduce resource usage. Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 55 +++++++++++++++++++++++++++++ include/uapi/linux/android/binder.h | 10 ++++++ 2 files changed, 65 insertions(+) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 9c06e7f46d7f..dcd75ed81d00 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -4712,6 +4712,42 @@ out: return ret; } +static int binder_ioctl_get_node_info_for_ref(struct binder_proc *proc, + struct binder_node_info_for_ref *info) +{ + struct binder_node *node; + struct binder_context *context = proc->context; + __u32 handle = info->handle; + + if (info->strong_count || info->weak_count || info->reserved1 || + info->reserved2 || info->reserved3) { + binder_user_error("%d BINDER_GET_NODE_INFO_FOR_REF: only handle may be non-zero.", + proc->pid); + return -EINVAL; + } + + /* This ioctl may only be used by the context manager */ + mutex_lock(&context->context_mgr_node_lock); + if (!context->binder_context_mgr_node || + context->binder_context_mgr_node->proc != proc) { + mutex_unlock(&context->context_mgr_node_lock); + return -EPERM; + } + mutex_unlock(&context->context_mgr_node_lock); + + node = binder_get_node_from_ref(proc, handle, true, NULL); + if (!node) + return -EINVAL; + + info->strong_count = node->local_strong_refs + + node->internal_strong_refs; + info->weak_count = node->local_weak_refs; + + binder_put_node(node); + + return 0; +} + static int binder_ioctl_get_node_debug_info(struct binder_proc *proc, struct binder_node_debug_info *info) { @@ -4806,6 +4842,25 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } break; } + case BINDER_GET_NODE_INFO_FOR_REF: { + struct binder_node_info_for_ref info; + + if (copy_from_user(&info, ubuf, sizeof(info))) { + ret = -EFAULT; + goto err; + } + + ret = binder_ioctl_get_node_info_for_ref(proc, &info); + if (ret < 0) + goto err; + + if (copy_to_user(ubuf, &info, sizeof(info))) { + ret = -EFAULT; + goto err; + } + + break; + } case BINDER_GET_NODE_DEBUG_INFO: { struct binder_node_debug_info info; diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index b4723e36b6cf..4a1c285b97b1 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -247,6 +247,15 @@ struct binder_node_debug_info { __u32 has_weak_ref; }; +struct binder_node_info_for_ref { + __u32 handle; + __u32 strong_count; + __u32 weak_count; + __u32 reserved1; + __u32 reserved2; + __u32 reserved3; +}; + #define BINDER_WRITE_READ _IOWR('b', 1, struct binder_write_read) #define BINDER_SET_IDLE_TIMEOUT _IOW('b', 3, __s64) #define BINDER_SET_MAX_THREADS _IOW('b', 5, __u32) @@ -255,6 +264,7 @@ struct binder_node_debug_info { #define BINDER_THREAD_EXIT _IOW('b', 8, __s32) #define BINDER_VERSION _IOWR('b', 9, struct binder_version) #define BINDER_GET_NODE_DEBUG_INFO _IOWR('b', 11, struct binder_node_debug_info) +#define BINDER_GET_NODE_INFO_FOR_REF _IOWR('b', 12, struct binder_node_info_for_ref) /* * NOTE: Two special error codes you should check for when calling From e709f59a8cd2405f59451073c58b3e335c643b27 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Wed, 26 Sep 2018 13:48:19 -0700 Subject: [PATCH 33/37] ANDROID: restrict store of prefer_idle as boolean It works as boolean so stores like a boolean too. Bug: 116734731 Test: Set stune Change-Id: I0daa3cc1723d009ed5bc2a71fa1c2e3d4ece6a7f Signed-off-by: Wei Wang --- kernel/sched/tune.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 765dcd46d468..74a45606dc8c 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -455,7 +455,7 @@ prefer_idle_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 prefer_idle) { struct schedtune *st = css_st(css); - st->prefer_idle = prefer_idle; + st->prefer_idle = !!prefer_idle; return 0; } From 5494a93da34671aa6b6f8b1e00b9b97778920692 Mon Sep 17 00:00:00 2001 From: Blagovest Kolenichev Date: Tue, 2 Apr 2019 01:55:44 -0700 Subject: [PATCH 34/37] Revert "ANDROID: sched: Add support for frequency/power energy model" This reverts commit 9001b8f1a599b974ca8f2f925c210a31c0c70d60. This is a preparation change for merging android-4.14 commit e709f59a8cd2405f59451073c58b3e335c643b27 into msm-4.14 branch. The reverted change is rejected in msm-4.14 branch since it does not bring in any fixes or improvements to the energy model. Moreover the later kernels have moved on to use simplified energy model. Change-Id: I0005fef09e52238070d507e93c26bc69d1e82ec9 Signed-off-by: Blagovest Kolenichev --- .../bindings/scheduler/sched-energy-costs.txt | 5 - include/linux/sched/topology.h | 2 - kernel/sched/energy.c | 176 +----------------- kernel/sched/fair.c | 2 +- 4 files changed, 7 insertions(+), 178 deletions(-) diff --git a/Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt b/Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt index 447cc32b3f21..2ceb202462cb 100644 --- a/Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt +++ b/Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt @@ -237,11 +237,6 @@ Property added to the cpu node: Any other configuration is invalid. -- freq-energy-model - Description: Optional. Must be declared if the energy model - represents frequency/power values. If absent, energy model is - by default considered as capacity/power. - =========================================================== 5 - Example dts =========================================================== diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 5fcb7dd36048..e0161c3da0da 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -69,7 +69,6 @@ extern int sched_domain_level_max; struct capacity_state { unsigned long cap; /* compute capacity */ - unsigned long frequency;/* frequency */ unsigned long power; /* power consumption at this compute capacity */ }; @@ -195,7 +194,6 @@ typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); typedef int (*sched_domain_flags_f)(void); typedef const struct sched_group_energy * const(*sched_domain_energy_f)(int cpu); -extern bool energy_aware(void); #define SDTL_OVERLAP 0x01 diff --git a/kernel/sched/energy.c b/kernel/sched/energy.c index 8342de041e11..be470c695293 100644 --- a/kernel/sched/energy.c +++ b/kernel/sched/energy.c @@ -18,6 +18,8 @@ */ #define pr_fmt(fmt) "sched-energy: " fmt +#define DEBUG + #include #include #include @@ -26,11 +28,6 @@ #include #include #include -#include -#include -#include - -#include "sched.h" struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS]; @@ -50,8 +47,6 @@ static void free_resources(void) } } } -static bool sge_ready; -static bool freq_energy_model; void check_max_cap_vs_cpu_scale(int cpu, struct sched_group_energy *sge) { @@ -88,9 +83,6 @@ void init_sched_energy_costs(void) pr_warn("CPU device node has no sched-energy-costs\n"); return; } - /* Check if the energy model contains frequency/power values */ - if (of_find_property(cn, "freq-energy-model", NULL)) - freq_energy_model = true; for_each_possible_sd_level(sd_level) { cp = of_parse_phandle(cn, "sched-energy-costs", sd_level); @@ -112,18 +104,7 @@ void init_sched_energy_costs(void) GFP_NOWAIT); for (i = 0, val = prop->value; i < nstates; i++) { - if (freq_energy_model) { - /* - * Capacity values will be calculated later using - * frequency reported by OPP driver and cpu_uarch_scale - * values. - */ - cap_states[i].frequency = be32_to_cpup(val++); - cap_states[i].cap = 0; - } else { - cap_states[i].frequency = 0; - cap_states[i].cap = be32_to_cpup(val++); - } + cap_states[i].cap = be32_to_cpup(val++); cap_states[i].power = be32_to_cpup(val++); } @@ -149,158 +130,13 @@ void init_sched_energy_costs(void) sge_array[cpu][sd_level] = sge; } - if (!freq_energy_model) - check_max_cap_vs_cpu_scale(cpu, sge_array[cpu][SD_LEVEL0]); + + check_max_cap_vs_cpu_scale(cpu, sge_array[cpu][SD_LEVEL0]); } - sge_ready = true; + pr_info("Sched-energy-costs installed from DT\n"); return; out: free_resources(); } - -static int sched_energy_probe(struct platform_device *pdev) -{ - int cpu; - unsigned long *max_frequencies = NULL; - int ret; - - if (!sge_ready) - return -EPROBE_DEFER; - - if (!energy_aware() || !freq_energy_model) - return 0; - - max_frequencies = kmalloc_array(nr_cpu_ids, sizeof(unsigned long), - GFP_KERNEL); - if (!max_frequencies) { - ret = -ENOMEM; - goto exit; - } - - /* - * Find system max possible frequency and max frequencies for each - * CPUs. - */ - for_each_possible_cpu(cpu) { - struct device *cpu_dev; - struct dev_pm_opp *opp; - - cpu_dev = get_cpu_device(cpu); - if (IS_ERR_OR_NULL(cpu_dev)) { - if (!cpu_dev) - ret = -EINVAL; - else - ret = PTR_ERR(cpu_dev); - goto exit; - } - - max_frequencies[cpu] = ULONG_MAX; - - opp = dev_pm_opp_find_freq_floor(cpu_dev, - &max_frequencies[cpu]); - if (IS_ERR_OR_NULL(opp)) { - if (!opp || PTR_ERR(opp) == -ENODEV) - ret = -EPROBE_DEFER; - else - ret = PTR_ERR(opp); - goto exit; - } - - /* Convert HZ to KHZ */ - max_frequencies[cpu] /= 1000; - } - - /* update capacity in energy model */ - for_each_possible_cpu(cpu) { - unsigned long cpu_max_cap; - struct sched_group_energy *sge_l0, *sge; - cpu_max_cap = topology_get_cpu_scale(NULL, cpu); - - /* - * All the cap_states have same frequency table so use - * SD_LEVEL0's. - */ - sge_l0 = sge_array[cpu][SD_LEVEL0]; - if (sge_l0 && sge_l0->nr_cap_states > 0) { - int i; - int ncapstates = sge_l0->nr_cap_states; - - for (i = 0; i < ncapstates; i++) { - int sd_level; - unsigned long freq, cap; - - /* - * Energy model can contain more frequency - * steps than actual for multiple speedbin - * support. Ceil the max capacity with actual - * one. - */ - freq = min(sge_l0->cap_states[i].frequency, - max_frequencies[cpu]); - cap = DIV_ROUND_UP(cpu_max_cap * freq, - max_frequencies[cpu]); - - for_each_possible_sd_level(sd_level) { - sge = sge_array[cpu][sd_level]; - if (!sge) - break; - sge->cap_states[i].cap = cap; - } - dev_dbg(&pdev->dev, - "cpu=%d freq=%ld cap=%ld power_d0=%ld\n", - cpu, freq, sge_l0->cap_states[i].cap, - sge_l0->cap_states[i].power); - } - - dev_info(&pdev->dev, - "cpu=%d [freq=%ld cap=%ld power_d0=%ld] -> [freq=%ld cap=%ld power_d0=%ld]\n", - cpu, - sge_l0->cap_states[0].frequency, - sge_l0->cap_states[0].cap, - sge_l0->cap_states[0].power, - sge_l0->cap_states[ncapstates - 1].frequency, - sge_l0->cap_states[ncapstates - 1].cap, - sge_l0->cap_states[ncapstates - 1].power - ); - } - } - - kfree(max_frequencies); - dev_info(&pdev->dev, "Sched-energy-costs capacity updated\n"); - return 0; - -exit: - if (ret != -EPROBE_DEFER) - dev_err(&pdev->dev, "error=%d\n", ret); - kfree(max_frequencies); - return ret; -} - -static struct platform_driver energy_driver = { - .driver = { - .name = "sched-energy", - }, - .probe = sched_energy_probe, -}; - -static struct platform_device energy_device = { - .name = "sched-energy", -}; - -static int __init sched_energy_init(void) -{ - int ret; - - ret = platform_device_register(&energy_device); - if (ret) - pr_err("%s device_register failed:%d\n", __func__, ret); - ret = platform_driver_register(&energy_driver); - if (ret) { - pr_err("%s driver_register failed:%d\n", __func__, ret); - platform_device_unregister(&energy_device); - } - return ret; -} -subsys_initcall(sched_energy_init); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 85b305de2035..a434181f3d00 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5688,7 +5688,7 @@ unsigned long capacity_curr_of(int cpu) return cap_scale(max_cap, scale_freq); } -inline bool energy_aware(void) +static inline bool energy_aware(void) { return sched_feat(ENERGY_AWARE); } From e80ead3cf05b24666b8853405b540b6fb573a657 Mon Sep 17 00:00:00 2001 From: Blagovest Kolenichev Date: Tue, 2 Apr 2019 02:28:55 -0700 Subject: [PATCH 35/37] Revert "FROMLIST: ANDROID: binder: Add BINDER_GET_NODE_INFO_FOR_REF ioctl." This reverts commit 666c420fa3ea13f407550576c08f592d0f7d8202. This is a preparation change for merging android-4.14 commit e709f59a8cd2405f59451073c58b3e335c643b27 into msm-4.14 branch. The reverted change is committed already as: eec1c280b0 FROMLIST: ANDROID: binder: Add BINDER_GET_NODE_INFO_FOR_REF ioctl. Change-Id: I284da4503a16c0aeded90652b550dc02c8591615 Signed-off-by: Blagovest Kolenichev --- drivers/android/binder.c | 55 ----------------------------- include/uapi/linux/android/binder.h | 10 ------ 2 files changed, 65 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index dcd75ed81d00..9c06e7f46d7f 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -4712,42 +4712,6 @@ out: return ret; } -static int binder_ioctl_get_node_info_for_ref(struct binder_proc *proc, - struct binder_node_info_for_ref *info) -{ - struct binder_node *node; - struct binder_context *context = proc->context; - __u32 handle = info->handle; - - if (info->strong_count || info->weak_count || info->reserved1 || - info->reserved2 || info->reserved3) { - binder_user_error("%d BINDER_GET_NODE_INFO_FOR_REF: only handle may be non-zero.", - proc->pid); - return -EINVAL; - } - - /* This ioctl may only be used by the context manager */ - mutex_lock(&context->context_mgr_node_lock); - if (!context->binder_context_mgr_node || - context->binder_context_mgr_node->proc != proc) { - mutex_unlock(&context->context_mgr_node_lock); - return -EPERM; - } - mutex_unlock(&context->context_mgr_node_lock); - - node = binder_get_node_from_ref(proc, handle, true, NULL); - if (!node) - return -EINVAL; - - info->strong_count = node->local_strong_refs + - node->internal_strong_refs; - info->weak_count = node->local_weak_refs; - - binder_put_node(node); - - return 0; -} - static int binder_ioctl_get_node_debug_info(struct binder_proc *proc, struct binder_node_debug_info *info) { @@ -4842,25 +4806,6 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } break; } - case BINDER_GET_NODE_INFO_FOR_REF: { - struct binder_node_info_for_ref info; - - if (copy_from_user(&info, ubuf, sizeof(info))) { - ret = -EFAULT; - goto err; - } - - ret = binder_ioctl_get_node_info_for_ref(proc, &info); - if (ret < 0) - goto err; - - if (copy_to_user(ubuf, &info, sizeof(info))) { - ret = -EFAULT; - goto err; - } - - break; - } case BINDER_GET_NODE_DEBUG_INFO: { struct binder_node_debug_info info; diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index 4a1c285b97b1..b4723e36b6cf 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -247,15 +247,6 @@ struct binder_node_debug_info { __u32 has_weak_ref; }; -struct binder_node_info_for_ref { - __u32 handle; - __u32 strong_count; - __u32 weak_count; - __u32 reserved1; - __u32 reserved2; - __u32 reserved3; -}; - #define BINDER_WRITE_READ _IOWR('b', 1, struct binder_write_read) #define BINDER_SET_IDLE_TIMEOUT _IOW('b', 3, __s64) #define BINDER_SET_MAX_THREADS _IOW('b', 5, __u32) @@ -264,7 +255,6 @@ struct binder_node_info_for_ref { #define BINDER_THREAD_EXIT _IOW('b', 8, __s32) #define BINDER_VERSION _IOWR('b', 9, struct binder_version) #define BINDER_GET_NODE_DEBUG_INFO _IOWR('b', 11, struct binder_node_debug_info) -#define BINDER_GET_NODE_INFO_FOR_REF _IOWR('b', 12, struct binder_node_info_for_ref) /* * NOTE: Two special error codes you should check for when calling From 7d77e18d6e041d69a08a5a5dcd32821482954dde Mon Sep 17 00:00:00 2001 From: Blagovest Kolenichev Date: Tue, 2 Apr 2019 14:00:38 -0700 Subject: [PATCH 36/37] Revert "UPSTREAM: zram: set BDI_CAP_STABLE_WRITES once" This reverts commit 7b4e1b649ca74dc118c8d355719237f4dd886fbc. This is a preparation change for merging android-4.14 commit e709f59a8cd2405f59451073c58b3e335c643b27 into msm-4.14 branch. The reverted change is committed already as: 3694ff6b80 zram: set BDI_CAP_STABLE_WRITES once Change-Id: Id7d4ac9130c4d727df64e9887993a737af98f674 Signed-off-by: Blagovest Kolenichev --- drivers/block/zram/zram_drv.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 06f2623d0199..d4579377f77f 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -145,6 +145,14 @@ static inline bool is_partial_io(struct bio_vec *bvec) } #endif +static void zram_revalidate_disk(struct zram *zram) +{ + revalidate_disk(zram->disk); + /* revalidate_disk reset the BDI_CAP_STABLE_WRITES so set again */ + zram->disk->queue->backing_dev_info->capabilities |= + BDI_CAP_STABLE_WRITES; +} + /* * Check if request is within bounds and aligned on zram logical blocks. */ @@ -1509,8 +1517,7 @@ static ssize_t disksize_store(struct device *dev, zram->comp = comp; zram->disksize = disksize; set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); - - revalidate_disk(zram->disk); + zram_revalidate_disk(zram); up_write(&zram->init_lock); return len; @@ -1557,7 +1564,7 @@ static ssize_t reset_store(struct device *dev, /* Make sure all the pending I/O are finished */ fsync_bdev(bdev); zram_reset_device(zram); - revalidate_disk(zram->disk); + zram_revalidate_disk(zram); bdput(bdev); mutex_lock(&bdev->bd_mutex); @@ -1676,7 +1683,6 @@ static int zram_add(void) /* zram devices sort of resembles non-rotational disks */ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue); - /* * To ensure that we always get PAGE_SIZE aligned * and n*PAGE_SIZED sized I/O requests. @@ -1701,8 +1707,6 @@ static int zram_add(void) if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE) blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX); - zram->disk->queue->backing_dev_info->capabilities |= - BDI_CAP_STABLE_WRITES; add_disk(zram->disk); ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj, From a75f1d1a3edec0cbf5b0ca2ffbef2eff743c7546 Mon Sep 17 00:00:00 2001 From: Blagovest Kolenichev Date: Wed, 3 Apr 2019 02:42:13 -0700 Subject: [PATCH 37/37] Revert "ANDROID: sched/fair: prevent possible infinite loop in sched_group_energy" This reverts commit 727eafb4a89d381f79063f0a988399fe717bc57b. This is a preparation change for merging android-4.14 commit e709f59a8cd2405f59451073c58b3e335c643b27 into msm-4.14 branch. The reverted change is committed already as: 536e7d5d1f sched/fair: prevent possible infinite loop in compute_energy() Change-Id: I1f44e4b332b973f71d8e5945b7557b8c2e98030f Signed-off-by: Blagovest Kolenichev --- kernel/sched/fair.c | 32 ++------------------------------ 1 file changed, 2 insertions(+), 30 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a434181f3d00..45febbc8c004 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6238,21 +6238,11 @@ static int compute_energy(struct energy_env *eenv) int cpu; struct cpumask visit_cpus; struct sched_group *sg; - int cpu_count; WARN_ON(!eenv->sg_top->sge); cpumask_copy(&visit_cpus, sched_group_span(eenv->sg_top)); - /* If a cpu is hotplugged in while we are in this function, it does - * not appear in the existing visit_cpus mask which came from the - * sched_group pointer of the sched_domain pointed at by sd_ea for - * either the prev or next cpu and was dereferenced in - * select_energy_cpu_idx. - * Since we will dereference sd_scs later as we iterate through the - * CPUs we expect to visit, new CPUs can be present which are not in - * the visit_cpus mask. Guard this with cpu_count. - */ - cpu_count = cpumask_weight(&visit_cpus); + while (!cpumask_empty(&visit_cpus)) { struct sched_group *sg_shared_cap = NULL; @@ -6261,8 +6251,6 @@ static int compute_energy(struct energy_env *eenv) /* * Is the group utilization affected by cpus outside this * sched_group? - * This sd may have groups with cpus which were not present - * when we took visit_cpus. */ sd = rcu_dereference(per_cpu(sd_scs, cpu)); if (sd && sd->parent) @@ -6288,24 +6276,8 @@ static int compute_energy(struct energy_env *eenv) calc_sg_energy(eenv); /* remove CPUs we have just visited */ - if (!sd->child) { - /* - * cpu_count here is the number of - * cpus we expect to visit in this - * calculation. If we race against - * hotplug, we can have extra cpus - * added to the groups we are - * iterating which do not appear in - * the visit_cpus mask. In that case - * we are not able to calculate energy - * without restarting so we will bail - * out and use prev_cpu this time. - */ - if (!cpu_count) - return -EINVAL; + if (!sd->child) cpumask_xor(&visit_cpus, &visit_cpus, sched_group_span(sg)); - cpu_count--; - } if (cpumask_equal(sched_group_span(sg), sched_group_span(eenv->sg_top))) goto next_cpu;