Squashed commit of the following: commit 8ac5df9c8bc9575059fff6cea0c40463b96fc129 Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:58:17 2024 -0500 Revert "BACKPORT: bpf: add skb_load_bytes_relative helper" This reverts commit 029893dcc5d67af16fdf0723bacaae37ec567f67. commit dbcbceafe848744ec188f74e87e9717916d359ea Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:58:13 2024 -0500 Revert "BACKPORT: bpf: encapsulate verifier log state into a structure" This reverts commit d861145b97d247cbd9fe1400df52155f48639126. commit 478f4dfee0406b54525e68764cc9ba48af1624fc Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:58:10 2024 -0500 Revert "BACKPORT: bpf: Rename bpf_verifer_log" This reverts commit 5d088635de1bf2d6ae9ea94e3dd1c601d30c0cce. commit 7bc7c24beb82168b49337530cb56b5dfeeafe19a Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:58:07 2024 -0500 Revert "BACKPORT: bpf: btf: Introduce BPF Type Format (BTF)" This reverts commit 93d34e26514b4d9d15fd176706f57634b2e97485. commit 7106457ba90a459b6241fdd44df658c1b52c0e4b Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:58:03 2024 -0500 Revert "bpf: Update logging functions to work with BTF" This reverts commit 97e6c528eb2f76c58a3b6a4c1e7fbeafcd97633a. commit 08e68c7ba56f5e78fd1afcd5a2164716a75b0fe3 Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:58:00 2024 -0500 Revert "bpf: btf: Validate type reference" This reverts commit c7b7eecbc1134e5d8865af2cc0692fc7156175d5. commit 7763cf0831970a64ed62f9b7362fca02ab6e83f1 Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:57:51 2024 -0500 Revert "bpf: btf: Check members of struct/union" This reverts commit 9a77b51cad6f04866ca067ca0e70a89b9f59ed56. 
commit eb033235f666b5f66995f4cf89702de7ab4721f8 Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:57:47 2024 -0500 Revert "bpf: btf: Add pretty print capability for data with BTF type info" This reverts commit 745692103435221d6e39bc177811769995540525. commit c32995674ace91e06c591d2f63177585e81adc75 Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:57:43 2024 -0500 Revert "BACKPORT: bpf: btf: Add BPF_BTF_LOAD command" This reverts commit 4e0afd38e20e5aa2df444361309bc07251ca6b2a. commit 1310bc8d4aca0015c8723e7624121eddf76b3244 Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:57:38 2024 -0500 Revert "bpf: btf: Add BPF_OBJ_GET_INFO_BY_FD support to BTF fd" This reverts commit d4b5d76d9101b97e6fe5181bcefe7f601ed19926. commit 881a49445608712bdb0a0f0c959838bdbc725f62 Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:57:34 2024 -0500 Revert "BACKPORT: bpf: btf: Clean up btf.h in uapi" This reverts commit 26b661822933d41b3feb59bb284334bfbbc82af4. commit e2109fd858ebd5fe392c8bf579b9350fbca35a35 Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:57:29 2024 -0500 Revert "bpf: btf: Avoid WARN_ON when CONFIG_REFCOUNT_FULL=y" This reverts commit 9abf878903404e649fef4ad0b189eec1c13d29fe. commit 088a7d9137f03da4e0fc1d72add3901823081ccd Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:57:23 2024 -0500 Revert "bpf: Fix compiler warning on info.map_ids for 32bit platform" This reverts commit a3a278e1f6cf167d538ac52f4ad60bb9cf8d4129. commit 6e14aed6b63f2b266982454d83678445c062cf39 Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:57:13 2024 -0500 Revert "bpf: btf: Change how section is supported in btf_header" This reverts commit 4b60ffd683eb623a184b46761777838d7c49e707. 
commit 151a60855c23bf0317734031481d779efb369d6c Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:57:08 2024 -0500 Revert "bpf: btf: Check array->index_type" This reverts commit b00e10f1a073fadce178b6fb62496722e16db303. commit 49775e9074a54ac5f60f518e6fc5a26172996eae Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:57:01 2024 -0500 Revert "bpf: btf: Remove unused bits from uapi/linux/btf.h" This reverts commit c90c6ad34f7a8f565f351d21c2d5b9706838767d. commit b6d6c6ab28e4b018da6ce9e64125e63f4191d3d9 Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:56:58 2024 -0500 Revert "bpf: btf: Avoid variable length array" This reverts commit fe7d1f7750242e77a73839d173ac36c3e39d4171. commit a45bedecb9b1175fef96f2d64fba2d61777dbf35 Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:56:49 2024 -0500 Revert "bpf: btf: avoid -Wreturn-type warning" This reverts commit 78214f1e390bf1d69d9ae4ee80072ac85c34619e. commit 445efb8465b9fa5706d81098417f15656265322e Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:56:46 2024 -0500 Revert "bpf: btf: Check array t->size" This reverts commit aed532e7466f77885a362e4b863bf90c41e834ba. commit 8aada590d525de735cf39196d88722e727c141e9 Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:56:42 2024 -0500 Revert "bpf: btf: Ensure t->type == 0 for BTF_KIND_FWD" This reverts commit 8c8b601dcc2e62e1276b73dfee8b49e40fb65944. commit ed67ad09e866c9c30897488088bbb4555ea3dc80 Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:56:38 2024 -0500 Revert "bpf: btf: Fix bitfield extraction for big endian" This reverts commit b0696a226c52868d64963f01665dd1a640a92f2b. commit 5cc64db782daf86cdf7ac77133ca94181bb29146 Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:56:35 2024 -0500 Revert "bpf: btf: Clean up BTF_INT_BITS() in uapi btf.h" This reverts commit 0f008594540b09c667ea88fc87cf289b8db334da. 
commit 3a5c6b9010426449c08ecdcc10e758431b1e515f Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:56:31 2024 -0500 Revert "bpf: btf: Ensure the member->offset is in the right order" This reverts commit c5e361ecd6d45a7cdbffda02e4691a7a37198bdd. commit bd6173c1ac458b08d6cedaf06e6e53c93e6b0cc5 Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:56:26 2024 -0500 Revert "bpf: fix bpf_skb_load_bytes_relative pkt length check" This reverts commit 9ea14969874cd7896588df435c890f6f2f547821. commit 0b61d26b25a65d9ded4611426c6da9c78e41567c Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:56:22 2024 -0500 Revert "bpf: btf: Fix end boundary calculation for type section" This reverts commit 08ef221c7fb604cb60c490fa999ec7254d492f05. commit 72fb2b9bb5b90f60ab71915fe4e57eeee3308163 Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:56:18 2024 -0500 Revert "bpf: btf: Fix a missing check bug" This reverts commit 594687e3e01e26086f3b0173e5eda9b9f0b672f8. commit 575a34ceba4013ad0230038f29f6ea0b3ba41a7e Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:56:15 2024 -0500 Revert "bpf, btf: fix a missing check bug in btf_parse" This reverts commit 6bf31bbc438663756e92fb0aad4f5a35fd730fb0. commit bcca98c0bc5e19b38af3ddcd0feee80ad26e1f96 Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:56:11 2024 -0500 Revert "bpf: fix BTF limits" This reverts commit e351b26ae671dfacd82f27c1c5f66cf8089d930d. commit f71c484e340041d8828c94b39a233ea587d8cc09 Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:56:07 2024 -0500 Revert "bpf/btf: Fix BTF verification of enum members in struct/union" This reverts commit 861e65b744c171d59850e61a01715f194f25e45c. 
commit eca310722a2624d33cd49884aa18c36d435b10f8 Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:56:02 2024 -0500 Revert "bpf: btf: fix truncated last_member_type_id in btf_struct_resolve" This reverts commit d6cd1eac41b10e606ec7f445162a0617c01be973. commit caae5c99a3ca7bed0e318b31b6aa7ca8260a1c52 Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:55:58 2024 -0500 Revert "BACKPORT: net: bpf: rename ndo_xdp to ndo_bpf" This reverts commit 2a1ddcb6a384745195d57b4e4cdda2a55d2cbe47. commit f90bdcdaa095a4f10268bb740470a3e0893be21b Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:55:54 2024 -0500 Revert "BACKPORT: bpf: offload: add infrastructure for loading programs for a specific netdev" This reverts commit a9516d402726094eafccce26a99cf5110d188be9. commit c6e0ce9019c06d9a45c030a2bc38eed320afd45a Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:55:50 2024 -0500 Revert "bpf: offload: rename the ifindex field" This reverts commit 36bc9c7351a1dc78b3e71571998af381e876b4cb. commit 88b6a4d41b69df804b846a8ebdca410517e08343 Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:55:46 2024 -0500 Revert "BACKPORT: bpf: Check attach type at prog load time" This reverts commit fe5a0d514e4970d86983458136d4a2f6caeee365. commit 9ccfaa66a5ea042331f0aacdb3667e23c8ed363e Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:55:43 2024 -0500 Revert "BACKPORT: bpf: introduce BPF_PROG_QUERY command" This reverts commit a5720688858170f1054f9549b5a628db1c252a88. commit adab2743b3fa0853d0351b33b0a286de745025e5 Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:55:37 2024 -0500 Revert "BACKPORT: bpf: Hooks for sys_bind" This reverts commit e484887c7e7aa026521ddc1773233368a6304b24. 
commit d462e09db98ad89b3a836f9b9a925812b0d8cfe7 Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:55:33 2024 -0500 Revert "BACKPORT: net: Introduce __inet_bind() and __inet6_bind" This reverts commit 41a3131c3e94c28fd084dd6f4358baee3824fd17. commit cdf7f55dc65b4bdf7ecfc924be77c6a039709b3d Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:55:29 2024 -0500 Revert "BACKPORT: bpf: Hooks for sys_connect" This reverts commit f26fe7233e2885ef489707ab5a5a5dda9f081b80. commit 97685d5058f76ba4ea6dd2db157f4537f3a8953d Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:55:23 2024 -0500 Revert "BACKPORT: bpf: Post-hooks for sys_bind" This reverts commit 284ac5bc7c70dac338301445e94e1ad40fb40fdb. commit d03d9c05036d3109eae643f473cc5a5ad0a80721 Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:55:19 2024 -0500 Revert "kernel: bpf: devmap: Create __dev_map_alloc_node" This reverts commit db726149fa9abfd1ca9add3e2db6b1524f7e90a3. commit 8c34bcb3e4c6630799764871b4af2e5f9344a371 Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:55:15 2024 -0500 Revert "BACKPORT: xdp: Add devmap_hash map type for looking up devices by hashed index" This reverts commit c4d4e1d201d8433e06b2ac66041d7105095a0204. commit ef277c7b3a08fd59943eb2b47af64afc513de008 Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:55:11 2024 -0500 Revert "BACKPORT: devmap: Allow map lookups from eBPF" This reverts commit 24d196375871c72de0de977de79afede5a7d1780. commit 4fcd87869c55c28ed59bff916d640147601816d2 Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:55:07 2024 -0500 Revert "gen_headers_{arm, arm64}: Add btf.h to the list" This reverts commit 37edfe7c90bac355885ffec3327b338a34619792. 
commit b89560e0b405b58ecc5fc12c15ad4f56147760d6 Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:55:03 2024 -0500 Revert "syscall: Fake uname to 4.19 for bpfloader/netd" This reverts commit 186e74af61269602d0c068d98928b1f25e03eba2. commit fd49f8c35eb7875d6810a5a52877ebc59bfd4530 Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:54:59 2024 -0500 Revert "syscall: Fake uname to 4.19 also for netbpfload" This reverts commit 34b9a1ab387d7dc83ede613b2c12b3741ea08edb. commit b853fcf2ff892664d0ff522ca7fd530bc94c023e Author: John Galt <johngaltfirstrun@gmail.com> Date: Fri Dec 13 07:54:53 2024 -0500 Revert "syscall: Increase bpf fake uname to 5.4" This reverts commit 9cdc014e11b410a7f03d8c968a35ee0dd6a28fff.
421 lines
13 KiB
C
421 lines
13 KiB
C
/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of version 2 of the GNU General Public
|
|
* License as published by the Free Software Foundation.
|
|
*
|
|
* This program is distributed in the hope that it will be useful, but
|
|
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* General Public License for more details.
|
|
*/
|
|
|
|
/* Devmap's primary use is as a backend map for XDP BPF helper call
|
|
* bpf_redirect_map(). Because XDP is mostly concerned with performance we
|
|
* spent some effort to ensure the datapath with redirect maps does not use
|
|
* any locking. This is a quick note on the details.
|
|
*
|
|
* We have three possible paths to get into the devmap control plane: bpf
|
|
* syscalls, bpf programs, and driver side xmit/flush operations. A bpf syscall
|
|
* will invoke an update, delete, or lookup operation. To ensure updates and
|
|
* deletes appear atomic from the datapath side xchg() is used to modify the
|
|
* netdev_map array. Then because the datapath does a lookup into the netdev_map
|
|
* array (read-only) from an RCU critical section we use call_rcu() to wait for
|
|
* an rcu grace period before free'ing the old data structures. This ensures the
|
|
* datapath always has a valid copy. However, the datapath does a "flush"
|
|
* operation that pushes any pending packets in the driver outside the RCU
|
|
* critical section. Each bpf_dtab_netdev tracks these pending operations using
|
|
* an atomic per-cpu bitmap. The bpf_dtab_netdev object will not be destroyed
|
|
* until all bits are cleared indicating outstanding flush operations have
|
|
* completed.
|
|
*
|
|
* BPF syscalls may race with BPF program calls on any of the update, delete
|
|
* or lookup operations. As noted above the xchg() operation also keeps the
|
|
* netdev_map consistent in this case. From the devmap side BPF programs
|
|
* calling into these operations are the same as multiple user space threads
|
|
* making system calls.
|
|
*
|
|
* Finally, any of the above may race with a netdev_unregister notifier. The
|
|
* unregister notifier must search for net devices in the map structure that
|
|
* contain a reference to the net device and remove them. This is a two step
|
|
* process (a) dereference the bpf_dtab_netdev object in netdev_map and (b)
|
|
* check to see if the ifindex is the same as the net_device being removed.
|
|
* When removing the dev a cmpxchg() is used to ensure the correct dev is
|
|
* removed, in the case of a concurrent update or delete operation it is
|
|
* possible that the initially referenced dev is no longer in the map. As the
|
|
* notifier hook walks the map we know that new dev references can not be
|
|
* added by the user because core infrastructure ensures dev_get_by_index()
|
|
* calls will fail at this point.
|
|
*/
|
|
#include <linux/bpf.h>
|
|
#include <linux/filter.h>
|
|
|
|
/* Creation flags accepted by dev_map_alloc(); any other bit in
 * attr->map_flags makes the allocation fail with -EINVAL.
 */
#define DEV_CREATE_FLAG_MASK (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
|
|
|
|
struct bpf_dtab_netdev {
|
|
struct net_device *dev;
|
|
struct bpf_dtab *dtab;
|
|
unsigned int bit;
|
|
struct rcu_head rcu;
|
|
};
|
|
|
|
struct bpf_dtab {
|
|
struct bpf_map map;
|
|
struct bpf_dtab_netdev **netdev_map;
|
|
unsigned long __percpu *flush_needed;
|
|
struct list_head list;
|
|
};
|
|
|
|
/* dev_map_list links every live devmap so the netdevice notifier can walk
 * them under RCU; dev_map_lock is held around list additions and removals.
 */
static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);
|
|
|
|
static u64 dev_map_bitmap_size(const union bpf_attr *attr)
|
|
{
|
|
return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
|
|
}
|
|
|
|
static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
|
|
{
|
|
struct bpf_dtab *dtab;
|
|
int err = -EINVAL;
|
|
u64 cost;
|
|
|
|
if (!capable(CAP_NET_ADMIN))
|
|
return ERR_PTR(-EPERM);
|
|
|
|
/* check sanity of attributes */
|
|
if (attr->max_entries == 0 || attr->key_size != 4 ||
|
|
attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
dtab = kzalloc(sizeof(*dtab), GFP_USER);
|
|
if (!dtab)
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
/* mandatory map attributes */
|
|
dtab->map.map_type = attr->map_type;
|
|
dtab->map.key_size = attr->key_size;
|
|
dtab->map.value_size = attr->value_size;
|
|
dtab->map.max_entries = attr->max_entries;
|
|
dtab->map.map_flags = attr->map_flags;
|
|
dtab->map.numa_node = bpf_map_attr_numa_node(attr);
|
|
|
|
/* make sure page count doesn't overflow */
|
|
cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
|
|
cost += dev_map_bitmap_size(attr) * num_possible_cpus();
|
|
if (cost >= U32_MAX - PAGE_SIZE)
|
|
goto free_dtab;
|
|
|
|
dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
|
|
|
|
/* if map size is larger than memlock limit, reject it early */
|
|
err = bpf_map_precharge_memlock(dtab->map.pages);
|
|
if (err)
|
|
goto free_dtab;
|
|
|
|
err = -ENOMEM;
|
|
|
|
/* A per cpu bitfield with a bit per possible net device */
|
|
dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
|
|
__alignof__(unsigned long),
|
|
GFP_KERNEL | __GFP_NOWARN);
|
|
if (!dtab->flush_needed)
|
|
goto free_dtab;
|
|
|
|
dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
|
|
sizeof(struct bpf_dtab_netdev *),
|
|
dtab->map.numa_node);
|
|
if (!dtab->netdev_map)
|
|
goto free_dtab;
|
|
|
|
spin_lock(&dev_map_lock);
|
|
list_add_tail_rcu(&dtab->list, &dev_map_list);
|
|
spin_unlock(&dev_map_lock);
|
|
|
|
return &dtab->map;
|
|
free_dtab:
|
|
free_percpu(dtab->flush_needed);
|
|
kfree(dtab);
|
|
return ERR_PTR(err);
|
|
}
|
|
|
|
static void dev_map_free(struct bpf_map *map)
|
|
{
|
|
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
|
|
int i, cpu;
|
|
|
|
/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
|
|
* so the programs (can be more than one that used this map) were
|
|
* disconnected from events. Wait for outstanding critical sections in
|
|
* these programs to complete. The rcu critical section only guarantees
|
|
* no further reads against netdev_map. It does __not__ ensure pending
|
|
* flush operations (if any) are complete.
|
|
*/
|
|
|
|
spin_lock(&dev_map_lock);
|
|
list_del_rcu(&dtab->list);
|
|
spin_unlock(&dev_map_lock);
|
|
|
|
synchronize_rcu();
|
|
|
|
/* Make sure prior __dev_map_entry_free() have completed. */
|
|
rcu_barrier();
|
|
|
|
/* To ensure all pending flush operations have completed wait for flush
|
|
* bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
|
|
* Because the above synchronize_rcu() ensures the map is disconnected
|
|
* from the program we can assume no new bits will be set.
|
|
*/
|
|
for_each_online_cpu(cpu) {
|
|
unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu);
|
|
|
|
while (!bitmap_empty(bitmap, dtab->map.max_entries))
|
|
cond_resched();
|
|
}
|
|
|
|
for (i = 0; i < dtab->map.max_entries; i++) {
|
|
struct bpf_dtab_netdev *dev;
|
|
|
|
dev = dtab->netdev_map[i];
|
|
if (!dev)
|
|
continue;
|
|
|
|
dev_put(dev->dev);
|
|
kfree(dev);
|
|
}
|
|
|
|
free_percpu(dtab->flush_needed);
|
|
bpf_map_area_free(dtab->netdev_map);
|
|
kfree(dtab);
|
|
}
|
|
|
|
static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
|
|
{
|
|
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
|
|
u32 index = key ? *(u32 *)key : U32_MAX;
|
|
u32 *next = next_key;
|
|
|
|
if (index >= dtab->map.max_entries) {
|
|
*next = 0;
|
|
return 0;
|
|
}
|
|
|
|
if (index == dtab->map.max_entries - 1)
|
|
return -ENOENT;
|
|
*next = index + 1;
|
|
return 0;
|
|
}
|
|
|
|
/* Record on the current cpu that slot @bit of @map needs a flush; the bit
 * is consumed by __dev_map_flush().
 */
void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);

	__set_bit(bit, this_cpu_ptr(dtab->flush_needed));
}
|
|
|
|
/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
|
|
* from the driver before returning from its napi->poll() routine. The poll()
|
|
* routine is called either from busy_poll context or net_rx_action signaled
|
|
* from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
|
|
* net device can be torn down. On devmap tear down we ensure the ctx bitmap
|
|
* is zeroed before completing to ensure all flush operations have completed.
|
|
*/
|
|
void __dev_map_flush(struct bpf_map *map)
|
|
{
|
|
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
|
|
unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
|
|
u32 bit;
|
|
|
|
for_each_set_bit(bit, bitmap, map->max_entries) {
|
|
struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
|
|
struct net_device *netdev;
|
|
|
|
/* This is possible if the dev entry is removed by user space
|
|
* between xdp redirect and flush op.
|
|
*/
|
|
if (unlikely(!dev))
|
|
continue;
|
|
|
|
__clear_bit(bit, bitmap);
|
|
netdev = dev->dev;
|
|
if (likely(netdev->netdev_ops->ndo_xdp_flush))
|
|
netdev->netdev_ops->ndo_xdp_flush(netdev);
|
|
}
|
|
}
|
|
|
|
/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
|
|
* update happens in parallel here a dev_put wont happen until after reading the
|
|
* ifindex.
|
|
*/
|
|
struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
|
|
{
|
|
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
|
|
struct bpf_dtab_netdev *dev;
|
|
|
|
if (key >= map->max_entries)
|
|
return NULL;
|
|
|
|
dev = READ_ONCE(dtab->netdev_map[key]);
|
|
return dev ? dev->dev : NULL;
|
|
}
|
|
|
|
static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
|
|
{
|
|
struct net_device *dev = __dev_map_lookup_elem(map, *(u32 *)key);
|
|
|
|
return dev ? &dev->ifindex : NULL;
|
|
}
|
|
|
|
static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
|
|
{
|
|
if (dev->dev->netdev_ops->ndo_xdp_flush) {
|
|
struct net_device *fl = dev->dev;
|
|
unsigned long *bitmap;
|
|
int cpu;
|
|
|
|
for_each_online_cpu(cpu) {
|
|
bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
|
|
__clear_bit(dev->bit, bitmap);
|
|
|
|
fl->netdev_ops->ndo_xdp_flush(dev->dev);
|
|
}
|
|
}
|
|
}
|
|
|
|
static void __dev_map_entry_free(struct rcu_head *rcu)
|
|
{
|
|
struct bpf_dtab_netdev *dev;
|
|
|
|
dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
|
|
dev_map_flush_old(dev);
|
|
dev_put(dev->dev);
|
|
kfree(dev);
|
|
}
|
|
|
|
static int dev_map_delete_elem(struct bpf_map *map, void *key)
|
|
{
|
|
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
|
|
struct bpf_dtab_netdev *old_dev;
|
|
int k = *(u32 *)key;
|
|
|
|
if (k >= map->max_entries)
|
|
return -EINVAL;
|
|
|
|
/* Use call_rcu() here to ensure any rcu critical sections have
|
|
* completed, but this does not guarantee a flush has happened
|
|
* yet. Because driver side rcu_read_lock/unlock only protects the
|
|
* running XDP program. However, for pending flush operations the
|
|
* dev and ctx are stored in another per cpu map. And additionally,
|
|
* the driver tear down ensures all soft irqs are complete before
|
|
* removing the net device in the case of dev_put equals zero.
|
|
*/
|
|
old_dev = xchg(&dtab->netdev_map[k], NULL);
|
|
if (old_dev)
|
|
call_rcu(&old_dev->rcu, __dev_map_entry_free);
|
|
return 0;
|
|
}
|
|
|
|
static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
|
|
u64 map_flags)
|
|
{
|
|
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
|
|
struct net *net = current->nsproxy->net_ns;
|
|
struct bpf_dtab_netdev *dev, *old_dev;
|
|
u32 i = *(u32 *)key;
|
|
u32 ifindex = *(u32 *)value;
|
|
|
|
if (unlikely(map_flags > BPF_EXIST))
|
|
return -EINVAL;
|
|
if (unlikely(i >= dtab->map.max_entries))
|
|
return -E2BIG;
|
|
if (unlikely(map_flags == BPF_NOEXIST))
|
|
return -EEXIST;
|
|
|
|
if (!ifindex) {
|
|
dev = NULL;
|
|
} else {
|
|
dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
|
|
map->numa_node);
|
|
if (!dev)
|
|
return -ENOMEM;
|
|
|
|
dev->dev = dev_get_by_index(net, ifindex);
|
|
if (!dev->dev) {
|
|
kfree(dev);
|
|
return -EINVAL;
|
|
}
|
|
|
|
dev->bit = i;
|
|
dev->dtab = dtab;
|
|
}
|
|
|
|
/* Use call_rcu() here to ensure rcu critical sections have completed
|
|
* Remembering the driver side flush operation will happen before the
|
|
* net device is removed.
|
|
*/
|
|
old_dev = xchg(&dtab->netdev_map[i], dev);
|
|
if (old_dev)
|
|
call_rcu(&old_dev->rcu, __dev_map_entry_free);
|
|
|
|
return 0;
|
|
}
|
|
|
|
const struct bpf_map_ops dev_map_ops = {
|
|
.map_alloc = dev_map_alloc,
|
|
.map_free = dev_map_free,
|
|
.map_get_next_key = dev_map_get_next_key,
|
|
.map_lookup_elem = dev_map_lookup_elem,
|
|
.map_update_elem = dev_map_update_elem,
|
|
.map_delete_elem = dev_map_delete_elem,
|
|
};
|
|
|
|
static int dev_map_notification(struct notifier_block *notifier,
|
|
ulong event, void *ptr)
|
|
{
|
|
struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
|
|
struct bpf_dtab *dtab;
|
|
int i;
|
|
|
|
switch (event) {
|
|
case NETDEV_UNREGISTER:
|
|
/* This rcu_read_lock/unlock pair is needed because
|
|
* dev_map_list is an RCU list AND to ensure a delete
|
|
* operation does not free a netdev_map entry while we
|
|
* are comparing it against the netdev being unregistered.
|
|
*/
|
|
rcu_read_lock();
|
|
list_for_each_entry_rcu(dtab, &dev_map_list, list) {
|
|
for (i = 0; i < dtab->map.max_entries; i++) {
|
|
struct bpf_dtab_netdev *dev, *odev;
|
|
|
|
dev = READ_ONCE(dtab->netdev_map[i]);
|
|
if (!dev || netdev != dev->dev)
|
|
continue;
|
|
odev = cmpxchg(&dtab->netdev_map[i], dev, NULL);
|
|
if (dev == odev)
|
|
call_rcu(&dev->rcu,
|
|
__dev_map_entry_free);
|
|
}
|
|
}
|
|
rcu_read_unlock();
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
return NOTIFY_OK;
|
|
}
|
|
|
|
static struct notifier_block dev_map_notifier = {
|
|
.notifier_call = dev_map_notification,
|
|
};
|
|
|
|
/* Register the devmap netdevice notifier at subsys_initcall time.
 *
 * Returns whatever register_netdevice_notifier() returns, so a failed
 * registration is reported to the initcall machinery instead of being
 * silently dropped.
 */
static int __init dev_map_init(void)
{
	return register_netdevice_notifier(&dev_map_notifier);
}

subsys_initcall(dev_map_init);
|