* refs/heads/tmp-6f0b5b5:
Linux 4.9.207
net: stmmac: don't stop NAPI processing when dropping a packet
net: stmmac: use correct DMA buffer size in the RX descriptor
xhci: fix USB3 device initiated resume race with roothub autosuspend
drm/radeon: fix r1xx/r2xx register checker for POT textures
scsi: iscsi: Fix a potential deadlock in the timeout handler
dm btree: increase rebalance threshold in __rebalance2()
dma-buf: Fix memory leak in sync_file_merge()
vfio/pci: call irq_bypass_unregister_producer() before freeing irq
ARM: tegra: Fix FLOW_CTLR_HALT register clobbering by tegra_resume()
ARM: dts: s3c64xx: Fix init order of clock providers
CIFS: Respect O_SYNC and O_DIRECT flags during reconnect
xtensa: fix TLB sanity checker
PCI/MSI: Fix incorrect MSI-X masking on resume
PCI: Fix Intel ACS quirk UPDCR register address
Revert "regulator: Defer init completion for a while after late_initcall"
tcp: Protect accesses to .ts_recent_stamp with {READ,WRITE}_ONCE()
tcp: tighten acceptance of ACKs not matching a child socket
tcp: fix rejected syncookies due to stale timestamps
inet: protect against too small mtu values.
tipc: fix ordering of tipc module init and exit routine
tcp: md5: fix potential overestimation of TCP option space
openvswitch: support asymmetric conntrack
net: ethernet: ti: cpsw: fix extra rx interrupt
net: bridge: deny dev_set_mac_address() when unregistering
nvme: host: core: fix precedence of ternary operator
kernel/module.c: wakeup processes in module_wq on module unload
net/mlx5e: Fix SFF 8472 eeprom length
sunrpc: fix crash when cache_head become valid before update
workqueue: Fix missing kfree(rescuer) in destroy_workqueue()
blk-mq: make sure that line break can be printed
ext4: fix a bug in ext4_wait_for_tail_page_commit
mm/shmem.c: cast the type of unmap_start to u64
firmware: qcom: scm: Ensure 'a0' status code is treated as signed
reiserfs: fix extended attributes on the root directory
powerpc: Fix vDSO clock_getres()
scsi: qla2xxx: Always check the qla2x00_wait_for_hba_online() return value
scsi: qla2xxx: Fix qla24xx_process_bidir_cmd()
scsi: qla2xxx: Fix session lookup in qlt_abort_work()
scsi: qla2xxx: Fix DMA unmap leak
pinctrl: samsung: Fix device node refcount leaks in S3C64xx wakeup controller init
ARM: dts: omap3-tao3530: Fix incorrect MMC card detection GPIO polarity
ath10k: fix fw crash by moving chip reset after napi disabled
x86/MCE/AMD: Carve out the MC4_MISC thresholding quirk
x86/MCE/AMD: Turn off MC4_MISC thresholding on all family 0x15 models
e100: Fix passing zero to 'PTR_ERR' warning in e100_load_ucode_wait
scsi: lpfc: Cap NPIV vports to 256
omap: pdata-quirks: remove openpandora quirks for mmc3 and wl1251
Btrfs: fix negative subv_writers counter and data space leak after buffered write
iio: adis16480: Add debugfs_reg_access entry
xhci: make sure interrupts are restored to correct state
xhci: Fix memory leak in xhci_add_in_port()
usb: xhci: only set D3hot for pci device
scsi: zfcp: trace channel log even for FCP command responses
quota: fix livelock in dquot_writeback_dquots
ext2: check err when partial != NULL
quota: Check that quota is not dirty before release
video/hdmi: Fix AVI bar unpack
powerpc: Allow 64bit VDSO __kernel_sync_dicache to work across ranges >4GB
ppdev: fix PPGETTIME/PPSETTIME ioctls
mmc: host: omap_hsmmc: add code for special init of wl1251 to get rid of pandora_wl1251_init_card
pinctrl: samsung: Fix device node refcount leaks in init code
pinctrl: samsung: Fix device node refcount leaks in S3C24xx wakeup controller init
ACPI: PM: Avoid attaching ACPI PM domain to certain devices
ACPI: bus: Fix NULL pointer check in acpi_bus_get_private_data()
ACPI: OSL: only free map once in osl.c
PM / devfreq: Lock devfreq in trans_stat_show
cpuidle: Do not unset the driver if it is there already
media: radio: wl1273: fix interrupt masking on release
media: bdisp: fix memleak on release
ar5523: check NULL before memcpy() in ar5523_cmd()
cgroup: pids: use atomic64_t for pids->limit
blk-mq: avoid sysfs buffer overflow with too many CPU cores
ASoC: Jack: Fix NULL pointer dereference in snd_soc_jack_report
workqueue: Fix pwq ref leak in rescuer_thread()
workqueue: Fix spurious sanity check failures in destroy_workqueue()
ALSA: hda - Fix pending unsol events at shutdown
lib: raid6: fix awk build warnings
rtlwifi: rtl8192de: Fix missing enable interrupt flag
rtlwifi: rtl8192de: Fix missing callback that tests for hw release of buffer
rtlwifi: rtl8192de: Fix missing code to retrieve RX buffer address
btrfs: record all roots for rename exchange on a subvol
Btrfs: send, skip backreference walking for extents with many references
btrfs: Remove btrfs_bio::flags member
btrfs: check page->mapping when loading free space cache
virtio-balloon: fix managed page counts when migrating pages between zones
mtd: spear_smi: Fix Write Burst mode
usb: mon: Fix a deadlock in usbmon between mmap and read
usb: core: urb: fix URB structure initialization function
USB: adutux: fix interface sanity check
USB: serial: io_edgeport: fix epic endpoint lookup
USB: idmouse: fix interface sanity checks
USB: atm: ueagle-atm: add missing endpoint check
iio: humidity: hdc100x: fix IIO_HUMIDITYRELATIVE channel reporting
ARM: dts: pandora-common: define wl1251 as child node of mmc3
xhci: Increase STS_HALT timeout in xhci_suspend()
staging: gigaset: add endpoint-type sanity check
staging: gigaset: fix illegal free on probe errors
staging: gigaset: fix general protection fault on probe
staging: rtl8712: fix interface sanity check
staging: rtl8188eu: fix interface sanity check
usb: Allow USB device to be warm reset in suspended state
USB: uas: heed CAPACITY_HEURISTICS
USB: uas: honor flag to avoid CAPACITY16
usb: gadget: configfs: Fix missing spin_lock_init()
appletalk: Set error code if register_snap_client failed
appletalk: Fix potential NULL pointer dereference in unregister_snap_client
KVM: x86: fix out-of-bounds write in KVM_GET_EMULATED_CPUID (CVE-2019-19332)
thermal: Fix deadlock in thermal thermal_zone_device_check
RDMA/qib: Validate ->show()/store() callbacks before calling them
spi: atmel: Fix CS high support
crypto: user - fix memory leak in crypto_report
crypto: ecdh - fix big endian bug in ECC library
crypto: ccp - fix uninitialized list head
crypto: crypto4xx - fix double-free in crypto4xx_destroy_sdr
KVM: x86: fix presentation of TSX feature in ARCH_CAPABILITIES
KVM: x86: do not modify masked bits of shared MSRs
drm/i810: Prevent underflow in ioctl
jbd2: Fix possible overflow in jbd2_log_space_left()
can: slcan: Fix use-after-free Read in slcan_open
tty: vt: keyboard: reject invalid keycodes
CIFS: Fix SMB2 oplock break processing
CIFS: Fix NULL-pointer dereference in smb2_push_mandatory_locks
x86/PCI: Avoid AMD FCH XHCI USB PME# from D0 defect
coresight: etm4x: Fix input validation for sysfs.
Input: goodix - add upside-down quirk for Teclast X89 tablet
ALSA: pcm: oss: Avoid potential buffer overflows
fuse: verify attributes
fuse: verify nlink
sched/fair: Scale bandwidth quota and period without losing quota/period ratio precision
ARM: dts: sunxi: Fix PMU compatible strings
mlx4: Use snprintf instead of complicated strcpy
media: stkwebcam: Bugfix for wrong return values
tty: Don't block on IO when ldisc change is pending
nfsd: Return EPERM, not EACCES, in some SETATTR cases
MIPS: OCTEON: cvmx_pko_mem_debug8: use oldest forward compatible definition
powerpc/math-emu: Update macros from GCC
net/mlx4_core: Fix return codes of unsupported operations
dlm: fix invalid cluster name warning
ARM: dts: realview: Fix some more duplicate regulator nodes
clk: sunxi-ng: h3/h5: Fix CSI_MCLK parent
ARM: dts: pxa: clean up USB controller nodes
mtd: fix mtd_oobavail() incoherent returned value
kbuild: fix single target build for external module
modpost: skip ELF local symbols during section mismatch check
tcp: fix SNMP TCP timeout under-estimation
tcp: fix off-by-one bug on aborting window-probing socket
ARM: dts: realview-pbx: Fix duplicate regulator nodes
ARM: dts: mmp2: fix the gpio interrupt cell number
net/x25: fix null_x25_address handling
net/x25: fix called/calling length calculation in x25_parse_address_block
ARM: OMAP1/2: fix SoC name printing
nfsd: fix a warning in __cld_pipe_upcall()
ARM: debug: enable UART1 for socfpga Cyclone5
dlm: NULL check before kmem_cache_destroy is not needed
i2c: imx: don't print error message on probe defer
serial: imx: fix error handling in console_setup
altera-stapl: check for a null key before strcasecmp'ing it
dma-mapping: fix return type of dma_set_max_seg_size()
ACPI: fix acpi_find_child_device() invocation in acpi_preset_companion()
usb: dwc3: don't log probe deferrals; but do log other error codes
dmaengine: coh901318: Remove unused variable
dmaengine: coh901318: Fix a double-lock bug
media: pulse8-cec: return 0 when invalidating the logical address
ARM: dts: exynos: Use Samsung SoC specific compatible for DWC2 module
rtc: dt-binding: abx80x: fix resistance scale
rtc: max8997: Fix the returned value in case of error in 'max8997_rtc_read_alarm()'
math-emu/soft-fp.h: (_FP_ROUND_ZERO) cast 0 to void to fix warning
MIPS: OCTEON: octeon-platform: fix typing
regulator: Fix return value of _set_load() stub
Staging: iio: adt7316: Fix i2c data reading, set the data field
pinctrl: qcom: ssbi-gpio: fix gpio-hog related boot issues
scsi: zfcp: drop default switch case which might paper over missing case
MIPS: SiByte: Enable ZONE_DMA32 for LittleSur
dlm: fix missing idr_destroy for recover_idr
ARM: dts: rockchip: Fix rk3288-rock2 vcc_flash name
clk: rockchip: fix rk3188 sclk_mac_lbtest parameter ordering
clk: rockchip: fix rk3188 sclk_smc gate data
extcon: max8997: Fix lack of path setting in USB device mode
net/mlx5: Release resource on error flow
ARM: 8813/1: Make aligned 2-byte getuser()/putuser() atomic on ARMv6+
iwlwifi: mvm: Send non offchannel traffic via AP sta
cxgb4vf: fix memleak in mac_hlist initialization
serial: core: Allow processing sysrq at port unlock time
net: ep93xx_eth: fix mismatch of request_mem_region in remove
rsxx: add missed destroy_workqueue calls in remove
ALSA: pcm: Fix stream lock usage in snd_pcm_period_elapsed()
Input: cyttsp4_core - fix use after free bug
NFC: nxp-nci: Fix NULL pointer dereference after I2C communication error
audit_get_nd(): don't unlock parent too early
exportfs_decode_fh(): negative pinned may become positive without the parent locked
RDMA/hns: Correct the value of HNS_ROCE_HEM_CHUNK_LEN
autofs: fix a leak in autofs_expire_indirect()
serial: ifx6x60: add missed pm_runtime_disable
serial: serial_core: Perform NULL checks for break_ctl ops
serial: pl011: Fix DMA ->flush_buffer()
tty: serial: msm_serial: Fix flow control
tty: serial: fsl_lpuart: use the sg count from dma_map_sg
usb: gadget: u_serial: add missing port entry locking
arm64: tegra: Fix 'active-low' warning for Jetson TX1 regulator
UPSTREAM: binder: fix incorrect calculation for num_valid
ANDROID: sched/core: Fix arm32 allmodconfig build-break
BACKPORT: bpf: permit multiple bpf attachments for a single perf event
UPSTREAM: bpf: use the same condition in perf event set/free bpf handler
BACKPORT: bpf: multi program support for cgroup+bpf
Conflicts:
drivers/usb/dwc3/core.c
drivers/usb/host/xhci.c
include/trace/events/sched.h
Change-Id: Ida9ef3977cd175731c312341385819f5812f707c
Signed-off-by: jianzhou <jianzhou@codeaurora.org>
751 lines
20 KiB
C
751 lines
20 KiB
C
/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
|
|
* Copyright (c) 2016 Facebook
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of version 2 of the GNU General Public
|
|
* License as published by the Free Software Foundation.
|
|
*/
|
|
#include <linux/kernel.h>
|
|
#include <linux/types.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/bpf.h>
|
|
#include <linux/bpf_perf_event.h>
|
|
#include <linux/filter.h>
|
|
#include <linux/uaccess.h>
|
|
#include <linux/ctype.h>
|
|
#include "trace.h"
|
|
|
|
/**
|
|
* trace_call_bpf - invoke BPF program
|
|
* @call: tracepoint event
|
|
* @ctx: opaque context pointer
|
|
*
|
|
* kprobe handlers execute BPF programs via this helper.
|
|
* Can be used from static tracepoints in the future.
|
|
*
|
|
* Return: BPF programs always return an integer which is interpreted by
|
|
* kprobe handler as:
|
|
* 0 - return from kprobe (event is filtered out)
|
|
* 1 - store kprobe event into ring buffer
|
|
* Other values are reserved and currently alias to 1
|
|
*/
|
|
unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
|
|
{
|
|
unsigned int ret;
|
|
|
|
if (in_nmi()) /* not supported yet */
|
|
return 1;
|
|
|
|
preempt_disable();
|
|
|
|
if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
|
|
/*
|
|
* since some bpf program is already running on this cpu,
|
|
* don't call into another bpf program (same or different)
|
|
* and don't send kprobe event into ring-buffer,
|
|
* so return zero here
|
|
*/
|
|
ret = 0;
|
|
goto out;
|
|
}
|
|
|
|
/*
|
|
* Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock
|
|
* to all call sites, we did a bpf_prog_array_valid() there to check
|
|
* whether call->prog_array is empty or not, which is
|
|
* a heurisitc to speed up execution.
|
|
*
|
|
* If bpf_prog_array_valid() fetched prog_array was
|
|
* non-NULL, we go into trace_call_bpf() and do the actual
|
|
* proper rcu_dereference() under RCU lock.
|
|
* If it turns out that prog_array is NULL then, we bail out.
|
|
* For the opposite, if the bpf_prog_array_valid() fetched pointer
|
|
* was NULL, you'll skip the prog_array with the risk of missing
|
|
* out of events when it was updated in between this and the
|
|
* rcu_dereference() which is accepted risk.
|
|
*/
|
|
ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, BPF_PROG_RUN);
|
|
|
|
out:
|
|
__this_cpu_dec(bpf_prog_active);
|
|
preempt_enable();
|
|
|
|
return ret;
|
|
}
|
|
EXPORT_SYMBOL_GPL(trace_call_bpf);
|
|
|
|
/*
 * BPF helper: read @size bytes from @unsafe_ptr into @dst through
 * probe_kernel_read(). If the read reports an error the whole destination
 * is zeroed before the error code is returned.
 */
BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr)
{
	int err = probe_kernel_read(dst, unsafe_ptr, size);

	if (likely(err >= 0))
		return err;

	/* failed read: do not leave stale bytes in the caller's buffer */
	memset(dst, 0, size);
	return err;
}
|
|
|
|
static const struct bpf_func_proto bpf_probe_read_proto = {
|
|
.func = bpf_probe_read,
|
|
.gpl_only = true,
|
|
.ret_type = RET_INTEGER,
|
|
.arg1_type = ARG_PTR_TO_RAW_STACK,
|
|
.arg2_type = ARG_CONST_STACK_SIZE,
|
|
.arg3_type = ARG_ANYTHING,
|
|
};
|
|
|
|
/*
 * BPF helper: copy a string from @unsafe_ptr into @dst (at most @size bytes)
 * through strncpy_from_unsafe() and return that function's result.
 */
BPF_CALL_3(bpf_probe_read_str, void *, dst, u32, size, const void *, unsafe_ptr)
{
	int copied = strncpy_from_unsafe(dst, unsafe_ptr, size);

	/*
	 * strncpy_from_unsafe() will likely not fill the entire buffer.
	 * That is fine here: arbitrary memory is being probed anyway,
	 * similar to bpf_probe_read(), so the stack might as well be
	 * probed too. The buffer is therefore cleared only in the error
	 * case, so that improper users ignoring the return code do not
	 * copy garbage; otherwise the length of the string is returned,
	 * which can be used for bpf_perf_event_output() et al.
	 */
	if (likely(copied >= 0))
		return copied;

	memset(dst, 0, size);
	return copied;
}
|
|
|
|
static const struct bpf_func_proto bpf_probe_read_str_proto = {
|
|
.func = bpf_probe_read_str,
|
|
.gpl_only = true,
|
|
.ret_type = RET_INTEGER,
|
|
.arg1_type = ARG_PTR_TO_RAW_STACK,
|
|
.arg2_type = ARG_CONST_STACK_SIZE,
|
|
.arg3_type = ARG_ANYTHING,
|
|
};
|
|
|
|
/*
 * BPF helper: write @size bytes from @src to the user address @unsafe_ptr.
 * Returns -EPERM when the calling context or the target range is refused,
 * otherwise the result of probe_kernel_write().
 */
BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
	   u32, size)
{
	bool wrong_context;

	/*
	 * Ensure we're in user context which is safe for the helper to
	 * run. This helper has no business in a kthread.
	 */
	wrong_context = in_interrupt() ||
			(current->flags & (PF_KTHREAD | PF_EXITING));
	if (unlikely(wrong_context))
		return -EPERM;

	/*
	 * access_ok() should prevent writing to non-user memory, but in
	 * some situations (nommu, temporary switch, etc) access_ok() does
	 * not provide enough validation, hence the check on KERNEL_DS.
	 */
	if (unlikely(segment_eq(get_fs(), KERNEL_DS)) ||
	    !access_ok(VERIFY_WRITE, unsafe_ptr, size))
		return -EPERM;

	return probe_kernel_write(unsafe_ptr, src, size);
}
|
|
|
|
static const struct bpf_func_proto bpf_probe_write_user_proto = {
|
|
.func = bpf_probe_write_user,
|
|
.gpl_only = true,
|
|
.ret_type = RET_INTEGER,
|
|
.arg1_type = ARG_ANYTHING,
|
|
.arg2_type = ARG_PTR_TO_STACK,
|
|
.arg3_type = ARG_CONST_STACK_SIZE,
|
|
};
|
|
|
|
static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
|
|
{
|
|
pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!",
|
|
current->comm, task_pid_nr(current));
|
|
|
|
return &bpf_probe_write_user_proto;
|
|
}
|
|
|
|
/*
|
|
* limited trace_printk()
|
|
* only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed
|
|
*/
|
|
BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
|
|
u64, arg2, u64, arg3)
|
|
{
|
|
bool str_seen = false;
|
|
int mod[3] = {};
|
|
int fmt_cnt = 0;
|
|
u64 unsafe_addr;
|
|
char buf[64];
|
|
int i;
|
|
|
|
/*
|
|
* bpf_check()->check_func_arg()->check_stack_boundary()
|
|
* guarantees that fmt points to bpf program stack,
|
|
* fmt_size bytes of it were initialized and fmt_size > 0
|
|
*/
|
|
if (fmt[--fmt_size] != 0)
|
|
return -EINVAL;
|
|
|
|
/* check format string for allowed specifiers */
|
|
for (i = 0; i < fmt_size; i++) {
|
|
if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i]))
|
|
return -EINVAL;
|
|
|
|
if (fmt[i] != '%')
|
|
continue;
|
|
|
|
if (fmt_cnt >= 3)
|
|
return -EINVAL;
|
|
|
|
/* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
|
|
i++;
|
|
if (fmt[i] == 'l') {
|
|
mod[fmt_cnt]++;
|
|
i++;
|
|
} else if (fmt[i] == 'p' || fmt[i] == 's') {
|
|
mod[fmt_cnt]++;
|
|
/* disallow any further format extensions */
|
|
if (fmt[i + 1] != 0 &&
|
|
!isspace(fmt[i + 1]) &&
|
|
!ispunct(fmt[i + 1]))
|
|
return -EINVAL;
|
|
fmt_cnt++;
|
|
if (fmt[i] == 's') {
|
|
if (str_seen)
|
|
/* allow only one '%s' per fmt string */
|
|
return -EINVAL;
|
|
str_seen = true;
|
|
|
|
switch (fmt_cnt) {
|
|
case 1:
|
|
unsafe_addr = arg1;
|
|
arg1 = (long) buf;
|
|
break;
|
|
case 2:
|
|
unsafe_addr = arg2;
|
|
arg2 = (long) buf;
|
|
break;
|
|
case 3:
|
|
unsafe_addr = arg3;
|
|
arg3 = (long) buf;
|
|
break;
|
|
}
|
|
buf[0] = 0;
|
|
strncpy_from_unsafe(buf,
|
|
(void *) (long) unsafe_addr,
|
|
sizeof(buf));
|
|
}
|
|
continue;
|
|
}
|
|
|
|
if (fmt[i] == 'l') {
|
|
mod[fmt_cnt]++;
|
|
i++;
|
|
}
|
|
|
|
if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x')
|
|
return -EINVAL;
|
|
fmt_cnt++;
|
|
}
|
|
|
|
/* Horrid workaround for getting va_list handling working with different
|
|
* argument type combinations generically for 32 and 64 bit archs.
|
|
*/
|
|
#define __BPF_TP_EMIT() __BPF_ARG3_TP()
|
|
#define __BPF_TP(...) \
|
|
__trace_printk(1 /* Fake ip will not be printed. */, \
|
|
fmt, ##__VA_ARGS__)
|
|
|
|
#define __BPF_ARG1_TP(...) \
|
|
((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64)) \
|
|
? __BPF_TP(arg1, ##__VA_ARGS__) \
|
|
: ((mod[0] == 1 || (mod[0] == 0 && __BITS_PER_LONG == 32)) \
|
|
? __BPF_TP((long)arg1, ##__VA_ARGS__) \
|
|
: __BPF_TP((u32)arg1, ##__VA_ARGS__)))
|
|
|
|
#define __BPF_ARG2_TP(...) \
|
|
((mod[1] == 2 || (mod[1] == 1 && __BITS_PER_LONG == 64)) \
|
|
? __BPF_ARG1_TP(arg2, ##__VA_ARGS__) \
|
|
: ((mod[1] == 1 || (mod[1] == 0 && __BITS_PER_LONG == 32)) \
|
|
? __BPF_ARG1_TP((long)arg2, ##__VA_ARGS__) \
|
|
: __BPF_ARG1_TP((u32)arg2, ##__VA_ARGS__)))
|
|
|
|
#define __BPF_ARG3_TP(...) \
|
|
((mod[2] == 2 || (mod[2] == 1 && __BITS_PER_LONG == 64)) \
|
|
? __BPF_ARG2_TP(arg3, ##__VA_ARGS__) \
|
|
: ((mod[2] == 1 || (mod[2] == 0 && __BITS_PER_LONG == 32)) \
|
|
? __BPF_ARG2_TP((long)arg3, ##__VA_ARGS__) \
|
|
: __BPF_ARG2_TP((u32)arg3, ##__VA_ARGS__)))
|
|
|
|
return __BPF_TP_EMIT();
|
|
}
|
|
|
|
static const struct bpf_func_proto bpf_trace_printk_proto = {
|
|
.func = bpf_trace_printk,
|
|
.gpl_only = true,
|
|
.ret_type = RET_INTEGER,
|
|
.arg1_type = ARG_PTR_TO_STACK,
|
|
.arg2_type = ARG_CONST_STACK_SIZE,
|
|
};
|
|
|
|
const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
|
|
{
|
|
/*
|
|
* this program might be calling bpf_trace_printk,
|
|
* so allocate per-cpu printk buffers
|
|
*/
|
|
trace_printk_init_buffers();
|
|
|
|
return &bpf_trace_printk_proto;
|
|
}
|
|
|
|
/*
 * BPF helper: read the counter of the perf event stored in @map at the index
 * encoded in @flags (BPF_F_CURRENT_CPU selects the slot of the running cpu).
 *
 * Errors: -EINVAL for unknown flag bits, for an event that is neither
 * PERF_TYPE_HARDWARE nor PERF_TYPE_RAW, or for an event that is not local to
 * this cpu or has pmu::count; -E2BIG for an out-of-range index; -ENOENT for
 * an empty slot.
 */
BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
{
	struct bpf_array *events = container_of(map, struct bpf_array, map);
	unsigned int this_cpu = smp_processor_id();
	u64 slot = flags & BPF_F_INDEX_MASK;
	struct bpf_event_entry *entry;
	struct perf_event *pe;

	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
		return -EINVAL;

	if (slot == BPF_F_CURRENT_CPU)
		slot = this_cpu;
	if (unlikely(slot >= events->map.max_entries))
		return -E2BIG;

	entry = READ_ONCE(events->ptrs[slot]);
	if (!entry)
		return -ENOENT;

	pe = entry->event;
	if (unlikely(!(pe->attr.type == PERF_TYPE_HARDWARE ||
		       pe->attr.type == PERF_TYPE_RAW)))
		return -EINVAL;

	/* make sure event is local and doesn't have pmu::count */
	if (unlikely(pe->oncpu != this_cpu || pe->pmu->count))
		return -EINVAL;

	/*
	 * The return value alone does not tell whether the read succeeded;
	 * that has to be judged elsewhere, e.g. by the eBPF program.
	 */
	return perf_event_read_local(pe);
}
|
|
|
|
static const struct bpf_func_proto bpf_perf_event_read_proto = {
|
|
.func = bpf_perf_event_read,
|
|
.gpl_only = true,
|
|
.ret_type = RET_INTEGER,
|
|
.arg1_type = ARG_CONST_MAP_PTR,
|
|
.arg2_type = ARG_ANYTHING,
|
|
};
|
|
|
|
static __always_inline u64
|
|
__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
|
|
u64 flags, struct perf_raw_record *raw)
|
|
{
|
|
struct bpf_array *array = container_of(map, struct bpf_array, map);
|
|
unsigned int cpu = smp_processor_id();
|
|
u64 index = flags & BPF_F_INDEX_MASK;
|
|
struct perf_sample_data sample_data;
|
|
struct bpf_event_entry *ee;
|
|
struct perf_event *event;
|
|
|
|
if (index == BPF_F_CURRENT_CPU)
|
|
index = cpu;
|
|
if (unlikely(index >= array->map.max_entries))
|
|
return -E2BIG;
|
|
|
|
ee = READ_ONCE(array->ptrs[index]);
|
|
if (!ee)
|
|
return -ENOENT;
|
|
|
|
event = ee->event;
|
|
if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||
|
|
event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
|
|
return -EINVAL;
|
|
|
|
if (unlikely(event->oncpu != cpu))
|
|
return -EOPNOTSUPP;
|
|
|
|
perf_sample_data_init(&sample_data, 0, 0);
|
|
sample_data.raw = raw;
|
|
perf_event_output(event, &sample_data, regs);
|
|
return 0;
|
|
}
|
|
|
|
/*
 * BPF helper: emit @size bytes at @data as a raw sample on the perf event
 * selected from @map by @flags. Flag bits outside BPF_F_INDEX_MASK are
 * rejected with -EINVAL; everything else is up to __bpf_perf_event_output().
 */
BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
	   u64, flags, void *, data, u64, size)
{
	struct perf_raw_record raw = {
		.frag = { .size = size, .data = data },
	};
	const u64 unknown_bits = flags & ~(BPF_F_INDEX_MASK);

	if (unlikely(unknown_bits))
		return -EINVAL;

	return __bpf_perf_event_output(regs, map, flags, &raw);
}
|
|
|
|
static const struct bpf_func_proto bpf_perf_event_output_proto = {
|
|
.func = bpf_perf_event_output,
|
|
.gpl_only = true,
|
|
.ret_type = RET_INTEGER,
|
|
.arg1_type = ARG_PTR_TO_CTX,
|
|
.arg2_type = ARG_CONST_MAP_PTR,
|
|
.arg3_type = ARG_ANYTHING,
|
|
.arg4_type = ARG_PTR_TO_STACK,
|
|
.arg5_type = ARG_CONST_STACK_SIZE,
|
|
};
|
|
|
|
static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
|
|
|
|
u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
|
|
void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
|
|
{
|
|
struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs);
|
|
struct perf_raw_frag frag = {
|
|
.copy = ctx_copy,
|
|
.size = ctx_size,
|
|
.data = ctx,
|
|
};
|
|
struct perf_raw_record raw = {
|
|
.frag = {
|
|
{
|
|
.next = ctx_size ? &frag : NULL,
|
|
},
|
|
.size = meta_size,
|
|
.data = meta,
|
|
},
|
|
};
|
|
|
|
perf_fetch_caller_regs(regs);
|
|
|
|
return __bpf_perf_event_output(regs, map, flags, &raw);
|
|
}
|
|
|
|
BPF_CALL_0(bpf_get_current_task)
|
|
{
|
|
return (long) current;
|
|
}
|
|
|
|
static const struct bpf_func_proto bpf_get_current_task_proto = {
|
|
.func = bpf_get_current_task,
|
|
.gpl_only = true,
|
|
.ret_type = RET_INTEGER,
|
|
};
|
|
|
|
/*
 * BPF helper: test the current task against the cgroup stored at index @idx
 * of @map, returning the result of task_under_cgroup_hierarchy().
 *
 * Errors: -EINVAL when called in interrupt context, -E2BIG for an
 * out-of-range index, -EAGAIN when the slot is empty.
 */
BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
{
	struct bpf_array *cgroups = container_of(map, struct bpf_array, map);
	struct cgroup *target;

	if (unlikely(in_interrupt()))
		return -EINVAL;

	if (unlikely(idx >= cgroups->map.max_entries))
		return -E2BIG;

	target = READ_ONCE(cgroups->ptrs[idx]);
	if (likely(target))
		return task_under_cgroup_hierarchy(current, target);

	return -EAGAIN;
}
|
|
|
|
static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
|
|
.func = bpf_current_task_under_cgroup,
|
|
.gpl_only = false,
|
|
.ret_type = RET_INTEGER,
|
|
.arg1_type = ARG_CONST_MAP_PTR,
|
|
.arg2_type = ARG_ANYTHING,
|
|
};
|
|
|
|
static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
|
|
{
|
|
switch (func_id) {
|
|
case BPF_FUNC_map_lookup_elem:
|
|
return &bpf_map_lookup_elem_proto;
|
|
case BPF_FUNC_map_update_elem:
|
|
return &bpf_map_update_elem_proto;
|
|
case BPF_FUNC_map_delete_elem:
|
|
return &bpf_map_delete_elem_proto;
|
|
case BPF_FUNC_probe_read:
|
|
return &bpf_probe_read_proto;
|
|
case BPF_FUNC_probe_read_str:
|
|
return &bpf_probe_read_str_proto;
|
|
case BPF_FUNC_ktime_get_ns:
|
|
return &bpf_ktime_get_ns_proto;
|
|
case BPF_FUNC_tail_call:
|
|
return &bpf_tail_call_proto;
|
|
case BPF_FUNC_get_current_pid_tgid:
|
|
return &bpf_get_current_pid_tgid_proto;
|
|
case BPF_FUNC_get_current_task:
|
|
return &bpf_get_current_task_proto;
|
|
case BPF_FUNC_get_current_uid_gid:
|
|
return &bpf_get_current_uid_gid_proto;
|
|
case BPF_FUNC_get_current_comm:
|
|
return &bpf_get_current_comm_proto;
|
|
case BPF_FUNC_trace_printk:
|
|
return bpf_get_trace_printk_proto();
|
|
case BPF_FUNC_get_smp_processor_id:
|
|
return &bpf_get_smp_processor_id_proto;
|
|
case BPF_FUNC_perf_event_read:
|
|
return &bpf_perf_event_read_proto;
|
|
case BPF_FUNC_probe_write_user:
|
|
return bpf_get_probe_write_proto();
|
|
case BPF_FUNC_current_task_under_cgroup:
|
|
return &bpf_current_task_under_cgroup_proto;
|
|
case BPF_FUNC_get_prandom_u32:
|
|
return &bpf_get_prandom_u32_proto;
|
|
default:
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
|
|
{
|
|
switch (func_id) {
|
|
case BPF_FUNC_perf_event_output:
|
|
return &bpf_perf_event_output_proto;
|
|
case BPF_FUNC_get_stackid:
|
|
return &bpf_get_stackid_proto;
|
|
default:
|
|
return tracing_func_proto(func_id);
|
|
}
|
|
}
|
|
|
|
/* bpf+kprobe programs can access fields of 'struct pt_regs' */
|
|
static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
|
|
enum bpf_reg_type *reg_type)
|
|
{
|
|
if (off < 0 || off >= sizeof(struct pt_regs))
|
|
return false;
|
|
if (type != BPF_READ)
|
|
return false;
|
|
if (off % size != 0)
|
|
return false;
|
|
return true;
|
|
}
|
|
|
|
static const struct bpf_verifier_ops kprobe_prog_ops = {
|
|
.get_func_proto = kprobe_prog_func_proto,
|
|
.is_valid_access = kprobe_prog_is_valid_access,
|
|
};
|
|
|
|
static struct bpf_prog_type_list kprobe_tl = {
|
|
.ops = &kprobe_prog_ops,
|
|
.type = BPF_PROG_TYPE_KPROBE,
|
|
};
|
|
|
|
/*
 * Tracepoint flavour of bpf_perf_event_output().
 *
 * r1 points to the perf tracepoint buffer where the first 8 bytes are hidden
 * from the bpf program and contain a pointer to 'struct pt_regs'. That
 * pointer is fetched from there and the regular bpf_perf_event_output()
 * helper body is called inline with it.
 */
BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map,
	   u64, flags, void *, data, u64, size)
{
	struct pt_regs **hidden_regs = tp_buff;

	return ____bpf_perf_event_output(*hidden_regs, map, flags, data, size);
}
|
|
|
|
static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
|
|
.func = bpf_perf_event_output_tp,
|
|
.gpl_only = true,
|
|
.ret_type = RET_INTEGER,
|
|
.arg1_type = ARG_PTR_TO_CTX,
|
|
.arg2_type = ARG_CONST_MAP_PTR,
|
|
.arg3_type = ARG_ANYTHING,
|
|
.arg4_type = ARG_PTR_TO_STACK,
|
|
.arg5_type = ARG_CONST_STACK_SIZE,
|
|
};
|
|
|
|
/*
 * Tracepoint flavour of bpf_get_stackid().
 *
 * As with the tracepoint perf-event-output helper, the 'struct pt_regs'
 * pointer is read from the start of the tracepoint buffer. This time the
 * other helper's function body cannot be inlined because it is external,
 * so the raw helper function is called instead.
 */
BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map,
	   u64, flags)
{
	struct pt_regs **hidden_regs = tp_buff;
	struct pt_regs *regs = *hidden_regs;

	return bpf_get_stackid((unsigned long) regs, (unsigned long) map,
			       flags, 0, 0);
}
|
|
|
|
static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
|
|
.func = bpf_get_stackid_tp,
|
|
.gpl_only = true,
|
|
.ret_type = RET_INTEGER,
|
|
.arg1_type = ARG_PTR_TO_CTX,
|
|
.arg2_type = ARG_CONST_MAP_PTR,
|
|
.arg3_type = ARG_ANYTHING,
|
|
};
|
|
|
|
static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
|
|
{
|
|
switch (func_id) {
|
|
case BPF_FUNC_perf_event_output:
|
|
return &bpf_perf_event_output_proto_tp;
|
|
case BPF_FUNC_get_stackid:
|
|
return &bpf_get_stackid_proto_tp;
|
|
default:
|
|
return tracing_func_proto(func_id);
|
|
}
|
|
}
|
|
|
|
static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type,
|
|
enum bpf_reg_type *reg_type)
|
|
{
|
|
if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
|
|
return false;
|
|
if (type != BPF_READ)
|
|
return false;
|
|
if (off % size != 0)
|
|
return false;
|
|
return true;
|
|
}
|
|
|
|
static const struct bpf_verifier_ops tracepoint_prog_ops = {
|
|
.get_func_proto = tp_prog_func_proto,
|
|
.is_valid_access = tp_prog_is_valid_access,
|
|
};
|
|
|
|
static struct bpf_prog_type_list tracepoint_tl = {
|
|
.ops = &tracepoint_prog_ops,
|
|
.type = BPF_PROG_TYPE_TRACEPOINT,
|
|
};
|
|
|
|
static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
|
|
enum bpf_reg_type *reg_type)
|
|
{
|
|
if (off < 0 || off >= sizeof(struct bpf_perf_event_data))
|
|
return false;
|
|
if (type != BPF_READ)
|
|
return false;
|
|
if (off % size != 0)
|
|
return false;
|
|
if (off == offsetof(struct bpf_perf_event_data, sample_period)) {
|
|
if (size != sizeof(u64))
|
|
return false;
|
|
} else {
|
|
if (size != sizeof(long))
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg,
|
|
int src_reg, int ctx_off,
|
|
struct bpf_insn *insn_buf,
|
|
struct bpf_prog *prog)
|
|
{
|
|
struct bpf_insn *insn = insn_buf;
|
|
|
|
switch (ctx_off) {
|
|
case offsetof(struct bpf_perf_event_data, sample_period):
|
|
BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64));
|
|
|
|
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
|
|
data), dst_reg, src_reg,
|
|
offsetof(struct bpf_perf_event_data_kern, data));
|
|
*insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg,
|
|
offsetof(struct perf_sample_data, period));
|
|
break;
|
|
default:
|
|
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
|
|
regs), dst_reg, src_reg,
|
|
offsetof(struct bpf_perf_event_data_kern, regs));
|
|
*insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), dst_reg, dst_reg, ctx_off);
|
|
break;
|
|
}
|
|
|
|
return insn - insn_buf;
|
|
}
|
|
|
|
static const struct bpf_verifier_ops perf_event_prog_ops = {
|
|
.get_func_proto = tp_prog_func_proto,
|
|
.is_valid_access = pe_prog_is_valid_access,
|
|
.convert_ctx_access = pe_prog_convert_ctx_access,
|
|
};
|
|
|
|
static DEFINE_MUTEX(bpf_event_mutex);
|
|
|
|
int perf_event_attach_bpf_prog(struct perf_event *event,
|
|
struct bpf_prog *prog)
|
|
{
|
|
struct bpf_prog_array __rcu *old_array;
|
|
struct bpf_prog_array *new_array;
|
|
int ret = -EEXIST;
|
|
|
|
mutex_lock(&bpf_event_mutex);
|
|
|
|
if (event->prog)
|
|
goto out;
|
|
|
|
old_array = rcu_dereference_protected(event->tp_event->prog_array,
|
|
lockdep_is_held(&bpf_event_mutex));
|
|
ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array);
|
|
if (ret < 0)
|
|
goto out;
|
|
|
|
/* set the new array to event->tp_event and set event->prog */
|
|
event->prog = prog;
|
|
rcu_assign_pointer(event->tp_event->prog_array, new_array);
|
|
bpf_prog_array_free(old_array);
|
|
|
|
out:
|
|
mutex_unlock(&bpf_event_mutex);
|
|
return ret;
|
|
}
|
|
|
|
void perf_event_detach_bpf_prog(struct perf_event *event)
|
|
{
|
|
struct bpf_prog_array __rcu *old_array;
|
|
struct bpf_prog_array *new_array;
|
|
int ret;
|
|
|
|
mutex_lock(&bpf_event_mutex);
|
|
|
|
if (!event->prog)
|
|
goto out;
|
|
|
|
old_array = rcu_dereference_protected(event->tp_event->prog_array,
|
|
lockdep_is_held(&bpf_event_mutex));
|
|
|
|
ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array);
|
|
if (ret < 0) {
|
|
bpf_prog_array_delete_safe(old_array, event->prog);
|
|
} else {
|
|
rcu_assign_pointer(event->tp_event->prog_array, new_array);
|
|
bpf_prog_array_free(old_array);
|
|
}
|
|
|
|
bpf_prog_put(event->prog);
|
|
event->prog = NULL;
|
|
|
|
out:
|
|
mutex_unlock(&bpf_event_mutex);
|
|
}
|
|
|
|
static struct bpf_prog_type_list perf_event_tl = {
|
|
.ops = &perf_event_prog_ops,
|
|
.type = BPF_PROG_TYPE_PERF_EVENT,
|
|
};
|
|
|
|
/*
 * Late initcall: register the kprobe, tracepoint and perf_event program
 * types (in that order) with the BPF core. Always returns 0.
 */
static int __init register_kprobe_prog_ops(void)
{
	struct bpf_prog_type_list *prog_types[] = {
		&kprobe_tl,
		&tracepoint_tl,
		&perf_event_tl,
	};
	int n;

	for (n = 0; n < ARRAY_SIZE(prog_types); n++)
		bpf_register_prog_type(prog_types[n]);

	return 0;
}
late_initcall(register_kprobe_prog_ops);
|