From f8e030dd8abb1022e76a3c0c8e2771ca74591ee8 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 18 Dec 2025 09:56:11 -0800 Subject: [PATCH 01/16] bpf: Convert bpf_selem_unlink_map to failable To prepare for changing bpf_local_storage_map_bucket::lock to rqspinlock, convert bpf_selem_unlink_map() to failable. It still always succeeds and returns 0 for now. Since some operations updating local storage cannot fail in the middle, open-code bpf_selem_unlink_map() to take the b->lock before the operation. There are two such locations: - bpf_local_storage_alloc() The first selem will be unlinked from smap if the cmpxchg of owner_storage_ptr fails, and that unlink should not fail. Therefore, hold b->lock when linking until allocation completes. Helpers that assume b->lock is held by callers are introduced: bpf_selem_link_map_nolock() and bpf_selem_unlink_map_nolock(). - bpf_local_storage_update() The three-step update process: link_map(new_selem), link_storage(new_selem), and unlink_map(old_selem) should not fail in the middle. In bpf_selem_unlink(), bpf_selem_unlink_map() and bpf_selem_unlink_storage() should either all succeed or fail as a whole instead of failing in the middle. So, return if unlink_map() fails. In bpf_local_storage_destroy(), since it cannot deadlock with itself or bpf_local_storage_map_free(), which the function might be racing with, WARN_ON() if bpf_selem_unlink_map() fails due to rqspinlock returning errors; a later patch will handle this properly. Signed-off-by: Amery Hung --- kernel/bpf/bpf_local_storage.c | 64 +++++++++++++++++++++++++++++----- 1 file changed, 55 insertions(+), 9 deletions(-) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index e2fe6c32822b8..4e3f227fd6341 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -347,7 +347,7 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, hlist_add_head_rcu(&selem->snode, &local_storage->list); } -static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) +static int bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) { struct bpf_local_storage_map *smap; struct bpf_local_storage_map_bucket *b; @@ -355,7 +355,7 @@ static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) if (unlikely(!selem_linked_to_map_lockless(selem))) /* selem has already be unlinked from smap */ - return; + return 0; smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); b = select_bucket(smap, selem); @@ -363,6 +363,14 @@ static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) if (likely(selem_linked_to_map(selem))) hlist_del_init_rcu(&selem->map_node); raw_spin_unlock_irqrestore(&b->lock, flags); + + return 0; +} + +static void bpf_selem_unlink_map_nolock(struct bpf_local_storage_elem *selem) +{ + if (likely(selem_linked_to_map(selem))) + hlist_del_init_rcu(&selem->map_node); } void bpf_selem_link_map(struct bpf_local_storage_map *smap, @@ -376,13 +384,26 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap, raw_spin_unlock_irqrestore(&b->lock, flags); } +static void bpf_selem_link_map_nolock(struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *selem, + struct bpf_local_storage_map_bucket *b) +{ + RCU_INIT_POINTER(SDATA(selem)->smap, smap); + hlist_add_head_rcu(&selem->map_node, &b->list); +} + void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) { + int err; + /* Always unlink from map before unlinking from local_storage * because selem will be freed after successfully unlinked from * the local_storage. 
*/ - bpf_selem_unlink_map(selem); + err = bpf_selem_unlink_map(selem); + if (err) + return; + bpf_selem_unlink_storage(selem, reuse_now); } @@ -424,6 +445,8 @@ int bpf_local_storage_alloc(void *owner, { struct bpf_local_storage *prev_storage, *storage; struct bpf_local_storage **owner_storage_ptr; + struct bpf_local_storage_map_bucket *b; + unsigned long flags; int err; err = mem_charge(smap, owner, sizeof(*storage)); @@ -448,7 +471,10 @@ int bpf_local_storage_alloc(void *owner, storage->use_kmalloc_nolock = smap->use_kmalloc_nolock; bpf_selem_link_storage_nolock(storage, first_selem); - bpf_selem_link_map(smap, first_selem); + + b = select_bucket(smap, first_selem); + raw_spin_lock_irqsave(&b->lock, flags); + bpf_selem_link_map_nolock(smap, first_selem, b); owner_storage_ptr = (struct bpf_local_storage **)owner_storage(smap, owner); @@ -464,10 +490,12 @@ int bpf_local_storage_alloc(void *owner, */ prev_storage = cmpxchg(owner_storage_ptr, NULL, storage); if (unlikely(prev_storage)) { - bpf_selem_unlink_map(first_selem); + bpf_selem_unlink_map_nolock(first_selem); + raw_spin_unlock_irqrestore(&b->lock, flags); err = -EAGAIN; goto uncharge; } + raw_spin_unlock_irqrestore(&b->lock, flags); return 0; @@ -488,9 +516,10 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, { struct bpf_local_storage_data *old_sdata = NULL; struct bpf_local_storage_elem *alloc_selem, *selem = NULL; + struct bpf_local_storage_map_bucket *b, *old_b = NULL; + unsigned long flags, b_flags, old_b_flags; struct bpf_local_storage *local_storage; HLIST_HEAD(old_selem_free_list); - unsigned long flags; int err; /* BPF_EXIST and BPF_NOEXIST cannot be both set */ @@ -574,20 +603,37 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, goto unlock; } + b = select_bucket(smap, selem); + + if (old_sdata) { + old_b = select_bucket(smap, SELEM(old_sdata)); + old_b = old_b == b ? NULL : old_b; + } + + raw_spin_lock_irqsave(&b->lock, b_flags); + + if (old_b) + raw_spin_lock_irqsave(&old_b->lock, old_b_flags); + alloc_selem = NULL; /* First, link the new selem to the map */ - bpf_selem_link_map(smap, selem); + bpf_selem_link_map_nolock(smap, selem, b); /* Second, link (and publish) the new selem to local_storage */ bpf_selem_link_storage_nolock(local_storage, selem); /* Third, remove old selem, SELEM(old_sdata) */ if (old_sdata) { - bpf_selem_unlink_map(SELEM(old_sdata)); + bpf_selem_unlink_map_nolock(SELEM(old_sdata)); bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata), &old_selem_free_list); } + if (old_b) + raw_spin_unlock_irqrestore(&old_b->lock, old_b_flags); + + raw_spin_unlock_irqrestore(&b->lock, b_flags); + unlock: raw_spin_unlock_irqrestore(&local_storage->lock, flags); bpf_selem_free_list(&old_selem_free_list, false); @@ -679,7 +725,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) /* Always unlink from map before unlinking from * local_storage. */ - bpf_selem_unlink_map(selem); + WARN_ON(bpf_selem_unlink_map(selem)); /* If local_storage list has only one element, the * bpf_selem_unlink_storage_nolock() will return true. * Otherwise, it will return false. The current loop iteration From fc8d474df82590e0ef7231cdc86defa674ad3786 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 18 Dec 2025 09:56:12 -0800 Subject: [PATCH 02/16] bpf: Convert bpf_selem_link_map to failable To prepare for changing bpf_local_storage_map_bucket::lock to rqspinlock, convert bpf_selem_link_map() to failable. 
It still always succeeds and returns 0 until the change happens. No functional change. __must_check is added to the function declaration locally to make sure all the callers are accounted for during the conversion. Signed-off-by: Amery Hung --- include/linux/bpf_local_storage.h | 4 ++-- kernel/bpf/bpf_local_storage.c | 6 ++++-- net/core/bpf_sk_storage.c | 4 +++- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 66432248cd810..6cabf5154cf62 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -178,8 +178,8 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now); -void bpf_selem_link_map(struct bpf_local_storage_map *smap, - struct bpf_local_storage_elem *selem); +int bpf_selem_link_map(struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *selem); struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 4e3f227fd6341..94a20c147bc76 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -373,8 +373,8 @@ static void bpf_selem_unlink_map_nolock(struct bpf_local_storage_elem *selem) hlist_del_init_rcu(&selem->map_node); } -void bpf_selem_link_map(struct bpf_local_storage_map *smap, - struct bpf_local_storage_elem *selem) +int bpf_selem_link_map(struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *selem) { struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem); unsigned long flags; @@ -382,6 +382,8 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap, raw_spin_lock_irqsave(&b->lock, flags); hlist_add_head_rcu(&selem->map_node, &b->list); raw_spin_unlock_irqrestore(&b->lock, flags); + + return 0; } static void bpf_selem_link_map_nolock(struct bpf_local_storage_map *smap, diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 850dd736ccd12..4f8e917f49d9e 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -191,7 +191,9 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) } if (new_sk_storage) { - bpf_selem_link_map(smap, copy_selem); + ret = bpf_selem_link_map(smap, copy_selem); + if (ret) + goto out; bpf_selem_link_storage_nolock(new_sk_storage, copy_selem); } else { ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC); From d6cd457a5a5fe175a7472839423ea0209dbc2f8f Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 18 Dec 2025 09:56:13 -0800 Subject: [PATCH 03/16] bpf: Open code bpf_selem_unlink_storage in bpf_selem_unlink To prepare for changing bpf_local_storage::lock to rqspinlock, open code bpf_selem_unlink_storage() in the only caller, bpf_selem_unlink(), since unlink_map and unlink_storage must be done together after all the necessary locks are acquired. 
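To see the shape this series is building toward, here is a condensed illustration of the combined unlink sequence, pieced together from patches 1, 3 and 5 (a sketch, not a verbatim copy of any single hunk):

        /* target form of bpf_selem_unlink() once every lock can fail */
        err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
        if (err)
                return err;
        err = bpf_selem_unlink_map(selem);      /* takes b->lock internally */
        if (err)
                goto out;                       /* nothing is left half-done */
        free_local_storage = bpf_selem_unlink_storage_nolock(local_storage,
                                                             selem, &selem_free_list);
out:
        raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);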
Signed-off-by: Amery Hung --- kernel/bpf/bpf_local_storage.c | 63 ++++++++++++++++------------------ 1 file changed, 29 insertions(+), 34 deletions(-) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 94a20c147bc76..0e3fa5fbaaf35 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -313,33 +313,6 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor return free_local_storage; } -static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, - bool reuse_now) -{ - struct bpf_local_storage *local_storage; - bool free_local_storage = false; - HLIST_HEAD(selem_free_list); - unsigned long flags; - - if (unlikely(!selem_linked_to_storage_lockless(selem))) - /* selem has already been unlinked from sk */ - return; - - local_storage = rcu_dereference_check(selem->local_storage, - bpf_rcu_lock_held()); - - raw_spin_lock_irqsave(&local_storage->lock, flags); - if (likely(selem_linked_to_storage(selem))) - free_local_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, &selem_free_list); - raw_spin_unlock_irqrestore(&local_storage->lock, flags); - - bpf_selem_free_list(&selem_free_list, reuse_now); - - if (free_local_storage) - bpf_local_storage_free(local_storage, reuse_now); -} - void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem) { @@ -396,17 +369,39 @@ static void bpf_selem_link_map_nolock(struct bpf_local_storage_map *smap, void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) { + struct bpf_local_storage *local_storage; + bool free_local_storage = false; + HLIST_HEAD(selem_free_list); + unsigned long flags; int err; - /* Always unlink from map before unlinking from local_storage - * because selem will be freed after successfully unlinked from - * the local_storage. - */ - err = bpf_selem_unlink_map(selem); - if (err) + if (unlikely(!selem_linked_to_storage_lockless(selem))) + /* selem has already been unlinked from sk */ return; - bpf_selem_unlink_storage(selem, reuse_now); + local_storage = rcu_dereference_check(selem->local_storage, + bpf_rcu_lock_held()); + + raw_spin_lock_irqsave(&local_storage->lock, flags); + if (likely(selem_linked_to_storage(selem))) { + /* Always unlink from map before unlinking from local_storage + * because selem will be freed after successfully unlinked from + * the local_storage. + */ + err = bpf_selem_unlink_map(selem); + if (err) + goto out; + + free_local_storage = bpf_selem_unlink_storage_nolock( + local_storage, selem, &selem_free_list); + } +out: + raw_spin_unlock_irqrestore(&local_storage->lock, flags); + + bpf_selem_free_list(&selem_free_list, reuse_now); + + if (free_local_storage) + bpf_local_storage_free(local_storage, reuse_now); } void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage, From 01c6b7efdc473c3ee26002f8a116a607de3b9277 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 18 Dec 2025 09:56:14 -0800 Subject: [PATCH 04/16] bpf: Convert bpf_selem_unlink to failable To prepare for changing both bpf_local_storage_map_bucket::lock and bpf_local_storage::lock to rqspinlock, convert bpf_selem_unlink() to failable. It still always succeeds and returns 0 until the change happens. No functional change. For bpf_local_storage_map_free(), WARN_ON() for now as no real error will happen until we switch to rqspinlock. __must_check is added to the function declaration locally to make sure all callers are accounted for during the conversion. 
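As a minimal illustration of what the temporary __must_check annotation buys during the conversion (the unchecked call below is hypothetical, not a caller from this series):

        int __must_check bpf_selem_unlink(struct bpf_local_storage_elem *selem,
                                          bool reuse_now);

        /* a caller that silently drops the new return value now gets a
         * -Wunused-result warning from the compiler
         */
        bpf_selem_unlink(selem, false);         /* warns */
        err = bpf_selem_unlink(selem, false);   /* ok, err must be handled */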
Signed-off-by: Amery Hung --- include/linux/bpf_local_storage.h | 2 +- kernel/bpf/bpf_cgrp_storage.c | 3 +-- kernel/bpf/bpf_inode_storage.c | 4 +--- kernel/bpf/bpf_local_storage.c | 8 +++++--- kernel/bpf/bpf_task_storage.c | 4 +--- net/core/bpf_sk_storage.c | 4 +--- 6 files changed, 10 insertions(+), 15 deletions(-) diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 6cabf5154cf62..a94e12ddd83de 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -176,7 +176,7 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem); -void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now); +int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now); int bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *selem); diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 0687a760974a4..8fef24fcac682 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -118,8 +118,7 @@ static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map) if (!sdata) return -ENOENT; - bpf_selem_unlink(SELEM(sdata), false); - return 0; + return bpf_selem_unlink(SELEM(sdata), false); } static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key) diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index e54cce2b91754..cedc99184dadd 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -110,9 +110,7 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map) if (!sdata) return -ENOENT; - bpf_selem_unlink(SELEM(sdata), false); - - return 0; + return bpf_selem_unlink(SELEM(sdata), false); } static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 0e3fa5fbaaf35..fa629a180e9e3 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -367,7 +367,7 @@ static void bpf_selem_link_map_nolock(struct bpf_local_storage_map *smap, hlist_add_head_rcu(&selem->map_node, &b->list); } -void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) +int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) { struct bpf_local_storage *local_storage; bool free_local_storage = false; @@ -377,7 +377,7 @@ void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) if (unlikely(!selem_linked_to_storage_lockless(selem))) /* selem has already been unlinked from sk */ - return; + return 0; local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held()); @@ -402,6 +402,8 @@ void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) if (free_local_storage) bpf_local_storage_free(local_storage, reuse_now); + + return err; } void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage, @@ -837,7 +839,7 @@ void bpf_local_storage_map_free(struct bpf_map *map, struct bpf_local_storage_elem, map_node))) { if (busy_counter) this_cpu_inc(*busy_counter); - bpf_selem_unlink(selem, true); + WARN_ON(bpf_selem_unlink(selem, true)); if (busy_counter) this_cpu_dec(*busy_counter); cond_resched_rcu(); diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index a1dc1bf0848a5..ab902364ac23f 100644 --- 
a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -167,9 +167,7 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map, if (!nobusy) return -EBUSY; - bpf_selem_unlink(SELEM(sdata), false); - - return 0; + return bpf_selem_unlink(SELEM(sdata), false); } static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 4f8e917f49d9e..fb1f041352a5b 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -40,9 +40,7 @@ static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map) if (!sdata) return -ENOENT; - bpf_selem_unlink(SELEM(sdata), false); - - return 0; + return bpf_selem_unlink(SELEM(sdata), false); } /* Called by __sk_destruct() & bpf_sk_storage_clone() */ From b33c14aca15a4428f29e4f76b6ebc2b3a8b95d25 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 18 Dec 2025 09:56:15 -0800 Subject: [PATCH 05/16] bpf: Change local_storage->lock and b->lock to rqspinlock Change bpf_local_storage::lock and bpf_local_storage_map_bucket::lock from raw_spin_lock to rqspinlock. Finally, propagate errors from raw_res_spin_lock_irqsave() to syscall return or BPF helper return. In bpf_local_storage_destroy(), WARN_ON for now. A later patch will handle this properly. For __bpf_local_storage_insert_cache(), instead of handling the error, skip updating the cache. Signed-off-by: Amery Hung --- include/linux/bpf_local_storage.h | 5 ++- kernel/bpf/bpf_local_storage.c | 72 ++++++++++++++++++++----------- 2 files changed, 51 insertions(+), 26 deletions(-) diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index a94e12ddd83de..903559e2ca910 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -15,12 +15,13 @@ #include #include #include +#include #define BPF_LOCAL_STORAGE_CACHE_SIZE 16 struct bpf_local_storage_map_bucket { struct hlist_head list; - raw_spinlock_t lock; + rqspinlock_t lock; }; /* Thp map is not the primary owner of a bpf_local_storage_elem. @@ -94,7 +95,7 @@ struct bpf_local_storage { * bpf_local_storage_elem. 
*/ struct rcu_head rcu; - raw_spinlock_t lock; /* Protect adding/removing from the "list" */ + rqspinlock_t lock; /* Protect adding/removing from the "list" */ bool use_kmalloc_nolock; }; diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index fa629a180e9e3..1d21ec11c80ec 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -325,6 +325,7 @@ static int bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) struct bpf_local_storage_map *smap; struct bpf_local_storage_map_bucket *b; unsigned long flags; + int err; if (unlikely(!selem_linked_to_map_lockless(selem))) /* selem has already be unlinked from smap */ @@ -332,10 +333,13 @@ static int bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); b = select_bucket(smap, selem); - raw_spin_lock_irqsave(&b->lock, flags); + err = raw_res_spin_lock_irqsave(&b->lock, flags); + if (err) + return err; + if (likely(selem_linked_to_map(selem))) hlist_del_init_rcu(&selem->map_node); - raw_spin_unlock_irqrestore(&b->lock, flags); + raw_res_spin_unlock_irqrestore(&b->lock, flags); return 0; } @@ -351,10 +355,14 @@ int bpf_selem_link_map(struct bpf_local_storage_map *smap, { struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem); unsigned long flags; + int err; + + err = raw_res_spin_lock_irqsave(&b->lock, flags); + if (err) + return err; - raw_spin_lock_irqsave(&b->lock, flags); hlist_add_head_rcu(&selem->map_node, &b->list); - raw_spin_unlock_irqrestore(&b->lock, flags); + raw_res_spin_unlock_irqrestore(&b->lock, flags); return 0; } @@ -382,7 +390,10 @@ int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held()); - raw_spin_lock_irqsave(&local_storage->lock, flags); + err = raw_res_spin_lock_irqsave(&local_storage->lock, flags); + if (err) + return err; + if (likely(selem_linked_to_storage(selem))) { /* Always unlink from map before unlinking from local_storage * because selem will be freed after successfully unlinked from @@ -396,7 +407,7 @@ int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) local_storage, selem, &selem_free_list); } out: - raw_spin_unlock_irqrestore(&local_storage->lock, flags); + raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); bpf_selem_free_list(&selem_free_list, reuse_now); @@ -411,16 +422,20 @@ void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem) { unsigned long flags; + int err; /* spinlock is needed to avoid racing with the * parallel delete. Otherwise, publishing an already * deleted sdata to the cache will become a use-after-free * problem in the next bpf_local_storage_lookup(). 
*/ - raw_spin_lock_irqsave(&local_storage->lock, flags); + err = raw_res_spin_lock_irqsave(&local_storage->lock, flags); + if (err) + return; + if (selem_linked_to_storage(selem)) rcu_assign_pointer(local_storage->cache[smap->cache_idx], SDATA(selem)); - raw_spin_unlock_irqrestore(&local_storage->lock, flags); + raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); } static int check_flags(const struct bpf_local_storage_data *old_sdata, @@ -465,14 +480,17 @@ int bpf_local_storage_alloc(void *owner, RCU_INIT_POINTER(storage->smap, smap); INIT_HLIST_HEAD(&storage->list); - raw_spin_lock_init(&storage->lock); + raw_res_spin_lock_init(&storage->lock); storage->owner = owner; storage->use_kmalloc_nolock = smap->use_kmalloc_nolock; bpf_selem_link_storage_nolock(storage, first_selem); b = select_bucket(smap, first_selem); - raw_spin_lock_irqsave(&b->lock, flags); + err = raw_res_spin_lock_irqsave(&b->lock, flags); + if (err) + goto uncharge; + bpf_selem_link_map_nolock(smap, first_selem, b); owner_storage_ptr = @@ -490,11 +508,11 @@ int bpf_local_storage_alloc(void *owner, prev_storage = cmpxchg(owner_storage_ptr, NULL, storage); if (unlikely(prev_storage)) { bpf_selem_unlink_map_nolock(first_selem); - raw_spin_unlock_irqrestore(&b->lock, flags); + raw_res_spin_unlock_irqrestore(&b->lock, flags); err = -EAGAIN; goto uncharge; } - raw_spin_unlock_irqrestore(&b->lock, flags); + raw_res_spin_unlock_irqrestore(&b->lock, flags); return 0; @@ -577,7 +595,9 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (!alloc_selem) return ERR_PTR(-ENOMEM); - raw_spin_lock_irqsave(&local_storage->lock, flags); + err = raw_res_spin_lock_irqsave(&local_storage->lock, flags); + if (err) + return ERR_PTR(err); /* Recheck local_storage->list under local_storage->lock */ if (unlikely(hlist_empty(&local_storage->list))) { @@ -609,10 +629,15 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, old_b = old_b == b ? NULL : old_b; } - raw_spin_lock_irqsave(&b->lock, b_flags); + err = raw_res_spin_lock_irqsave(&b->lock, b_flags); + if (err) + goto unlock; - if (old_b) - raw_spin_lock_irqsave(&old_b->lock, old_b_flags); + if (old_b) { + err = raw_res_spin_lock_irqsave(&old_b->lock, old_b_flags); + if (err) + goto unlock_b; + } alloc_selem = NULL; /* First, link the new selem to the map */ @@ -629,12 +654,11 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, } if (old_b) - raw_spin_unlock_irqrestore(&old_b->lock, old_b_flags); - - raw_spin_unlock_irqrestore(&b->lock, b_flags); - + raw_res_spin_unlock_irqrestore(&old_b->lock, old_b_flags); +unlock_b: + raw_res_spin_unlock_irqrestore(&b->lock, b_flags); unlock: - raw_spin_unlock_irqrestore(&local_storage->lock, flags); + raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); bpf_selem_free_list(&old_selem_free_list, false); if (alloc_selem) { mem_uncharge(smap, owner, smap->elem_size); @@ -719,7 +743,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) * when unlinking elem from the local_storage->list and * the map's bucket->list. */ - raw_spin_lock_irqsave(&local_storage->lock, flags); + WARN_ON(raw_res_spin_lock_irqsave(&local_storage->lock, flags)); hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { /* Always unlink from map before unlinking from * local_storage. 
@@ -734,7 +758,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) free_storage = bpf_selem_unlink_storage_nolock( local_storage, selem, &free_selem_list); } - raw_spin_unlock_irqrestore(&local_storage->lock, flags); + raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); bpf_selem_free_list(&free_selem_list, true); @@ -781,7 +805,7 @@ bpf_local_storage_map_alloc(union bpf_attr *attr, for (i = 0; i < nbuckets; i++) { INIT_HLIST_HEAD(&smap->buckets[i].list); - raw_spin_lock_init(&smap->buckets[i].lock); + raw_res_spin_lock_init(&smap->buckets[i].lock); } smap->elem_size = offsetof(struct bpf_local_storage_elem, From 60dc81cfddea68133298806e92a0a67a4870017d Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 18 Dec 2025 09:56:16 -0800 Subject: [PATCH 06/16] bpf: Remove task local storage percpu counter The percpu counter in task local storage is no longer needed as the underlying bpf_local_storage can now handle deadlock with the help of rqspinlock. Remove the percpu counter and related migrate_{disable, enable}. Since the percpu counter is removed, merge bpf_task_storage_get_recur() back into bpf_task_storage_get(). This will allow the bpf syscalls and helpers to run concurrently on the same CPU, removing the spurious -EBUSY error. bpf_task_storage_get(..., F_CREATE) will now always succeed when there is enough free memory, unless it is called recursively. Signed-off-by: Amery Hung --- kernel/bpf/bpf_task_storage.c | 150 ++++------------------------------ kernel/bpf/helpers.c | 4 - 2 files changed, 18 insertions(+), 136 deletions(-) diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index ab902364ac23f..dd858226ada2d 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -20,29 +20,6 @@ DEFINE_BPF_STORAGE_CACHE(task_cache); -static DEFINE_PER_CPU(int, bpf_task_storage_busy); - -static void bpf_task_storage_lock(void) -{ - cant_migrate(); - this_cpu_inc(bpf_task_storage_busy); -} - -static void bpf_task_storage_unlock(void) -{ - this_cpu_dec(bpf_task_storage_busy); -} - -static bool bpf_task_storage_trylock(void) -{ - cant_migrate(); - if (unlikely(this_cpu_inc_return(bpf_task_storage_busy) != 1)) { - this_cpu_dec(bpf_task_storage_busy); - return false; - } - return true; -} - static struct bpf_local_storage __rcu **task_storage_ptr(void *owner) { struct task_struct *task = owner; @@ -70,17 +47,15 @@ void bpf_task_storage_free(struct task_struct *task) { struct bpf_local_storage *local_storage; - rcu_read_lock_dont_migrate(); + rcu_read_lock(); local_storage = rcu_dereference(task->bpf_storage); if (!local_storage) goto out; - bpf_task_storage_lock(); bpf_local_storage_destroy(local_storage); - bpf_task_storage_unlock(); out: - rcu_read_unlock_migrate(); + rcu_read_unlock(); } static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key) @@ -106,9 +81,7 @@ static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key) goto out; } - bpf_task_storage_lock(); sdata = task_storage_lookup(task, map, true); - bpf_task_storage_unlock(); put_pid(pid); return sdata ? 
sdata->data : NULL; out: @@ -143,11 +116,9 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, goto out; } - bpf_task_storage_lock(); sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, map_flags, true, GFP_ATOMIC); - bpf_task_storage_unlock(); err = PTR_ERR_OR_ZERO(sdata); out: @@ -155,8 +126,7 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, return err; } -static int task_storage_delete(struct task_struct *task, struct bpf_map *map, - bool nobusy) +static int task_storage_delete(struct task_struct *task, struct bpf_map *map) { struct bpf_local_storage_data *sdata; @@ -164,9 +134,6 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map, if (!sdata) return -ENOENT; - if (!nobusy) - return -EBUSY; - return bpf_selem_unlink(SELEM(sdata), false); } @@ -192,111 +159,50 @@ static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) goto out; } - bpf_task_storage_lock(); - err = task_storage_delete(task, map, true); - bpf_task_storage_unlock(); + err = task_storage_delete(task, map); out: put_pid(pid); return err; } -/* Called by bpf_task_storage_get*() helpers */ -static void *__bpf_task_storage_get(struct bpf_map *map, - struct task_struct *task, void *value, - u64 flags, gfp_t gfp_flags, bool nobusy) +/* *gfp_flags* is a hidden argument provided by the verifier */ +BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, + task, void *, value, u64, flags, gfp_t, gfp_flags) { struct bpf_local_storage_data *sdata; - sdata = task_storage_lookup(task, map, nobusy); + WARN_ON_ONCE(!bpf_rcu_lock_held()); + if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task) + return (unsigned long)NULL; + + sdata = task_storage_lookup(task, map, true); if (sdata) - return sdata->data; + return (unsigned long)sdata->data; /* only allocate new storage, when the task is refcounted */ if (refcount_read(&task->usage) && - (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) { + (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) { sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, BPF_NOEXIST, false, gfp_flags); - return IS_ERR(sdata) ? NULL : sdata->data; + return IS_ERR(sdata) ? 
(unsigned long)NULL : (unsigned long)sdata->data; } - return NULL; -} - -/* *gfp_flags* is a hidden argument provided by the verifier */ -BPF_CALL_5(bpf_task_storage_get_recur, struct bpf_map *, map, struct task_struct *, - task, void *, value, u64, flags, gfp_t, gfp_flags) -{ - bool nobusy; - void *data; - - WARN_ON_ONCE(!bpf_rcu_lock_held()); - if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task) - return (unsigned long)NULL; - - nobusy = bpf_task_storage_trylock(); - data = __bpf_task_storage_get(map, task, value, flags, - gfp_flags, nobusy); - if (nobusy) - bpf_task_storage_unlock(); - return (unsigned long)data; -} - -/* *gfp_flags* is a hidden argument provided by the verifier */ -BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, - task, void *, value, u64, flags, gfp_t, gfp_flags) -{ - void *data; - - WARN_ON_ONCE(!bpf_rcu_lock_held()); - if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task) - return (unsigned long)NULL; - - bpf_task_storage_lock(); - data = __bpf_task_storage_get(map, task, value, flags, - gfp_flags, true); - bpf_task_storage_unlock(); - return (unsigned long)data; -} - -BPF_CALL_2(bpf_task_storage_delete_recur, struct bpf_map *, map, struct task_struct *, - task) -{ - bool nobusy; - int ret; - - WARN_ON_ONCE(!bpf_rcu_lock_held()); - if (!task) - return -EINVAL; - - nobusy = bpf_task_storage_trylock(); - /* This helper must only be called from places where the lifetime of the task - * is guaranteed. Either by being refcounted or by being protected - * by an RCU read-side critical section. - */ - ret = task_storage_delete(task, map, nobusy); - if (nobusy) - bpf_task_storage_unlock(); - return ret; + return (unsigned long)NULL; } BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *, task) { - int ret; - WARN_ON_ONCE(!bpf_rcu_lock_held()); if (!task) return -EINVAL; - bpf_task_storage_lock(); /* This helper must only be called from places where the lifetime of the task * is guaranteed. Either by being refcounted or by being protected * by an RCU read-side critical section. 
*/ - ret = task_storage_delete(task, map, true); - bpf_task_storage_unlock(); - return ret; + return task_storage_delete(task, map); } static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) @@ -311,7 +217,7 @@ static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr) static void task_storage_map_free(struct bpf_map *map) { - bpf_local_storage_map_free(map, &task_cache, &bpf_task_storage_busy); + bpf_local_storage_map_free(map, &task_cache, NULL); } BTF_ID_LIST_GLOBAL_SINGLE(bpf_local_storage_map_btf_id, struct, bpf_local_storage_map) @@ -330,17 +236,6 @@ const struct bpf_map_ops task_storage_map_ops = { .map_owner_storage_ptr = task_storage_ptr, }; -const struct bpf_func_proto bpf_task_storage_get_recur_proto = { - .func = bpf_task_storage_get_recur, - .gpl_only = false, - .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, - .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, - .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], - .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, - .arg4_type = ARG_ANYTHING, -}; - const struct bpf_func_proto bpf_task_storage_get_proto = { .func = bpf_task_storage_get, .gpl_only = false, @@ -352,15 +247,6 @@ const struct bpf_func_proto bpf_task_storage_get_proto = { .arg4_type = ARG_ANYTHING, }; -const struct bpf_func_proto bpf_task_storage_delete_recur_proto = { - .func = bpf_task_storage_delete_recur, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, - .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], -}; - const struct bpf_func_proto bpf_task_storage_delete_proto = { .func = bpf_task_storage_delete, .gpl_only = false, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index db72b96f9c8c8..33b470b9324d3 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2092,12 +2092,8 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_cgroup_classid_curr_proto; #endif case BPF_FUNC_task_storage_get: - if (bpf_prog_check_recur(prog)) - return &bpf_task_storage_get_recur_proto; return &bpf_task_storage_get_proto; case BPF_FUNC_task_storage_delete: - if (bpf_prog_check_recur(prog)) - return &bpf_task_storage_delete_recur_proto; return &bpf_task_storage_delete_proto; default: break; From e60e823cffa75ab9c05e5a5673ca982945858727 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 18 Dec 2025 09:56:17 -0800 Subject: [PATCH 07/16] bpf: Remove cgroup local storage percpu counter The percpu counter in cgroup local storage is no longer needed as the underlying bpf_local_storage can now handle deadlock with the help of rqspinlock. Remove the percpu counter and related migrate_{disable, enable}. 
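Side by side, the retired trylock scheme and its rqspinlock replacement, condensed from the diffs in this series for illustration:

        /* before: a percpu counter turned any same-CPU reentry into -EBUSY */
        if (unlikely(this_cpu_inc_return(bpf_cgrp_storage_busy) != 1)) {
                this_cpu_dec(bpf_cgrp_storage_busy);
                return -EBUSY;          /* even for harmless concurrent calls */
        }

        /* after: the lock itself reports only real deadlocks or timeouts */
        err = raw_res_spin_lock_irqsave(&b->lock, flags);
        if (err)
                return err;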
Signed-off-by: Amery Hung --- kernel/bpf/bpf_cgrp_storage.c | 59 +++++------------------------------ 1 file changed, 8 insertions(+), 51 deletions(-) diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 8fef24fcac682..4d84611d82229 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -11,29 +11,6 @@ DEFINE_BPF_STORAGE_CACHE(cgroup_cache); -static DEFINE_PER_CPU(int, bpf_cgrp_storage_busy); - -static void bpf_cgrp_storage_lock(void) -{ - cant_migrate(); - this_cpu_inc(bpf_cgrp_storage_busy); -} - -static void bpf_cgrp_storage_unlock(void) -{ - this_cpu_dec(bpf_cgrp_storage_busy); -} - -static bool bpf_cgrp_storage_trylock(void) -{ - cant_migrate(); - if (unlikely(this_cpu_inc_return(bpf_cgrp_storage_busy) != 1)) { - this_cpu_dec(bpf_cgrp_storage_busy); - return false; - } - return true; -} - static struct bpf_local_storage __rcu **cgroup_storage_ptr(void *owner) { struct cgroup *cg = owner; @@ -45,16 +22,14 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup) { struct bpf_local_storage *local_storage; - rcu_read_lock_dont_migrate(); + rcu_read_lock(); local_storage = rcu_dereference(cgroup->bpf_cgrp_storage); if (!local_storage) goto out; - bpf_cgrp_storage_lock(); bpf_local_storage_destroy(local_storage); - bpf_cgrp_storage_unlock(); out: - rcu_read_unlock_migrate(); + rcu_read_unlock(); } static struct bpf_local_storage_data * @@ -83,9 +58,7 @@ static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key) if (IS_ERR(cgroup)) return ERR_CAST(cgroup); - bpf_cgrp_storage_lock(); sdata = cgroup_storage_lookup(cgroup, map, true); - bpf_cgrp_storage_unlock(); cgroup_put(cgroup); return sdata ? sdata->data : NULL; } @@ -102,10 +75,8 @@ static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key, if (IS_ERR(cgroup)) return PTR_ERR(cgroup); - bpf_cgrp_storage_lock(); sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, value, map_flags, false, GFP_ATOMIC); - bpf_cgrp_storage_unlock(); cgroup_put(cgroup); return PTR_ERR_OR_ZERO(sdata); } @@ -131,9 +102,7 @@ static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key) if (IS_ERR(cgroup)) return PTR_ERR(cgroup); - bpf_cgrp_storage_lock(); err = cgroup_storage_delete(cgroup, map); - bpf_cgrp_storage_unlock(); cgroup_put(cgroup); return err; } @@ -150,7 +119,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) static void cgroup_storage_map_free(struct bpf_map *map) { - bpf_local_storage_map_free(map, &cgroup_cache, &bpf_cgrp_storage_busy); + bpf_local_storage_map_free(map, &cgroup_cache, NULL); } /* *gfp_flags* is a hidden argument provided by the verifier */ @@ -158,7 +127,6 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, void *, value, u64, flags, gfp_t, gfp_flags) { struct bpf_local_storage_data *sdata; - bool nobusy; WARN_ON_ONCE(!bpf_rcu_lock_held()); if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) @@ -167,38 +135,27 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, if (!cgroup) return (unsigned long)NULL; - nobusy = bpf_cgrp_storage_trylock(); - - sdata = cgroup_storage_lookup(cgroup, map, nobusy); + sdata = cgroup_storage_lookup(cgroup, map, true); if (sdata) - goto unlock; + goto out; /* only allocate new storage, when the cgroup is refcounted */ if (!percpu_ref_is_dying(&cgroup->self.refcnt) && - (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) + (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) sdata = bpf_local_storage_update(cgroup, 
(struct bpf_local_storage_map *)map, value, BPF_NOEXIST, false, gfp_flags); -unlock: - if (nobusy) - bpf_cgrp_storage_unlock(); +out: return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data; } BPF_CALL_2(bpf_cgrp_storage_delete, struct bpf_map *, map, struct cgroup *, cgroup) { - int ret; - WARN_ON_ONCE(!bpf_rcu_lock_held()); if (!cgroup) return -EINVAL; - if (!bpf_cgrp_storage_trylock()) - return -EBUSY; - - ret = cgroup_storage_delete(cgroup, map); - bpf_cgrp_storage_unlock(); - return ret; + return cgroup_storage_delete(cgroup, map); } const struct bpf_map_ops cgrp_storage_map_ops = { From c086fc4f69509f77adaba9d1a541adf54ad5a67c Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 18 Dec 2025 09:56:18 -0800 Subject: [PATCH 08/16] bpf: Remove unused percpu counter from bpf_local_storage_map_free The percpu counters used as locks have been removed from cgroup and task local storage. Now that no local storage uses percpu variables as locks to prevent recursion, there is no need to pass them to bpf_local_storage_map_free(). Remove the argument from the function. Signed-off-by: Amery Hung --- include/linux/bpf_local_storage.h | 3 +-- kernel/bpf/bpf_cgrp_storage.c | 2 +- kernel/bpf/bpf_inode_storage.c | 2 +- kernel/bpf/bpf_local_storage.c | 7 +------ kernel/bpf/bpf_task_storage.c | 2 +- net/core/bpf_sk_storage.c | 2 +- 6 files changed, 6 insertions(+), 12 deletions(-) diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 903559e2ca910..70b35dfc01c97 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -166,8 +166,7 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage, void bpf_local_storage_destroy(struct bpf_local_storage *local_storage); void bpf_local_storage_map_free(struct bpf_map *map, - struct bpf_local_storage_cache *cache, - int __percpu *busy_counter); + struct bpf_local_storage_cache *cache); int bpf_local_storage_map_check_btf(const struct bpf_map *map, const struct btf *btf, diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 4d84611d82229..853183eead2c2 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -119,7 +119,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) static void cgroup_storage_map_free(struct bpf_map *map) { - bpf_local_storage_map_free(map, &cgroup_cache, NULL); + bpf_local_storage_map_free(map, &cgroup_cache); } /* *gfp_flags* is a hidden argument provided by the verifier */ diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index cedc99184dadd..470f4b02c79ea 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -184,7 +184,7 @@ static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr) static void inode_storage_map_free(struct bpf_map *map) { - bpf_local_storage_map_free(map, &inode_cache, NULL); + bpf_local_storage_map_free(map, &inode_cache); } const struct bpf_map_ops inode_storage_map_ops = { diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 1d21ec11c80ec..667b468652d17 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -827,8 +827,7 @@ bpf_local_storage_map_alloc(union bpf_attr *attr, } void bpf_local_storage_map_free(struct bpf_map *map, - struct bpf_local_storage_cache *cache, - int __percpu *busy_counter) + struct bpf_local_storage_cache *cache) { struct bpf_local_storage_map_bucket *b; struct bpf_local_storage_elem *selem; @@ -861,11 +860,7 
@@ void bpf_local_storage_map_free(struct bpf_map *map, while ((selem = hlist_entry_safe( rcu_dereference_raw(hlist_first_rcu(&b->list)), struct bpf_local_storage_elem, map_node))) { - if (busy_counter) - this_cpu_inc(*busy_counter); WARN_ON(bpf_selem_unlink(selem, true)); - if (busy_counter) - this_cpu_dec(*busy_counter); cond_resched_rcu(); } rcu_read_unlock(); diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index dd858226ada2d..4d53aebe67848 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -217,7 +217,7 @@ static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr) static void task_storage_map_free(struct bpf_map *map) { - bpf_local_storage_map_free(map, &task_cache, NULL); + bpf_local_storage_map_free(map, &task_cache); } BTF_ID_LIST_GLOBAL_SINGLE(bpf_local_storage_map_btf_id, struct, bpf_local_storage_map) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index fb1f041352a5b..38acbecb8ef7f 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -60,7 +60,7 @@ void bpf_sk_storage_free(struct sock *sk) static void bpf_sk_storage_map_free(struct bpf_map *map) { - bpf_local_storage_map_free(map, &sk_cache, NULL); + bpf_local_storage_map_free(map, &sk_cache); } static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) From c87bbceba917eb0bf206c6dc2e7d0fe35eb7302c Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 18 Dec 2025 09:56:19 -0800 Subject: [PATCH 09/16] bpf: Save memory allocation method and size in bpf_local_storage_elem A later patch will introduce bpf_selem_unlink_lockless() to handle rqspinlock errors. bpf_selem_unlink_lockless() will allow an selem to be partially unlinked from map or local storage. Therefore, bpf_selem_free() needs to be decoupled from map and local storage as SDATA(selem)->smap or selem->local_storage may be NULL. Decoupling from local storage was already done when local storage migrated from the BPF memory allocator to kmalloc_nolock(). This patch prepares to decouple from map. Currently, map is still needed in bpf_selem_free() to: 1. Uncharge memory a. map->ops->map_local_storage_uncharge b. map->elem_size 2. Infer how memory should be freed a. map->use_kmalloc_nolock 3. Free special fields a. map->record The dependency of 1.a will be addressed by a later patch by returning the amount of memory to uncharge directly to the owner who calls bpf_local_storage_destroy(). The dependency of 3.a will be addressed by a later patch by freeing special fields under b->lock, when the map is still alive. This patch handles 1.b and 2.a by simply saving the information in bpf_local_storage_elem. Signed-off-by: Amery Hung --- include/linux/bpf_local_storage.h | 4 +++- kernel/bpf/bpf_local_storage.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 70b35dfc01c97..20918c31b7e57 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -80,7 +80,9 @@ struct bpf_local_storage_elem { * after raw_spin_unlock */ }; - /* 8 bytes hole */ + u16 size; + bool use_kmalloc_nolock; + /* 4 bytes hole */ /* The data is stored in another cacheline to minimize * the number of cachelines access during a cache hit. 
*/ diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 667b468652d17..62201552dca6a 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -97,6 +97,8 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, if (swap_uptrs) bpf_obj_swap_uptrs(smap->map.record, SDATA(selem)->data, value); } + selem->size = smap->elem_size; + selem->use_kmalloc_nolock = smap->use_kmalloc_nolock; return selem; } @@ -219,7 +221,7 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); - if (!smap->use_kmalloc_nolock) { + if (!selem->use_kmalloc_nolock) { /* * No uptr will be unpin even when reuse_now == false since uptr * is only supported in task local storage, where From c8ffa684bc48f8a91e2bdd2ed743d0c0b561d835 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 18 Dec 2025 09:56:20 -0800 Subject: [PATCH 10/16] bpf: Support lockless unlink when freeing map or local storage Introduce bpf_selem_unlink_lockless() to properly handle errors returned from rqspinlock in bpf_local_storage_map_free() and bpf_local_storage_destroy() where the operation must succeed. The idea of bpf_selem_unlink_lockless() is to allow an selem to be partially linked and to use a refcount to determine when, and by whom, the selem can be freed. An selem is initially fully linked to a map and a local storage and therefore selem->link_cnt is set to 2. Under normal circumstances, bpf_selem_unlink_lockless() will be able to grab locks and unlink an selem from map and local storage in sequence, just like bpf_selem_unlink(), and then add it to a local to_free list provided by the caller. However, if any of the lock attempts fails, it will only clear SDATA(selem)->smap or selem->local_storage depending on the caller and decrement link_cnt to signal that the corresponding data structure holding a reference to the selem is gone. Then, only when both the map and the local storage are gone can the selem be freed, by the last caller that brings link_cnt to 0. To make sure bpf_obj_free_fields() is done only once and when map is still present, it is called when unlinking an selem from b->list under b->lock. To make sure uncharging memory is only done once and when owner is still present, only unlink selem from local_storage->list in bpf_local_storage_destroy() and return the amount of memory to uncharge to the caller (i.e., owner) since the map associated with an selem may already be gone and map->ops->map_local_storage_uncharge can no longer be referenced. Finally, accesses of selem, SDATA(selem)->smap and selem->local_storage are racy. Callers will protect these fields with RCU. Co-developed-by: Martin KaFai Lau Signed-off-by: Martin KaFai Lau Signed-off-by: Amery Hung --- include/linux/bpf_local_storage.h | 2 +- kernel/bpf/bpf_local_storage.c | 77 +++++++++++++++++++++++++++++-- 2 files changed, 74 insertions(+), 5 deletions(-) diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 20918c31b7e57..1fd908c44fb6a 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -80,9 +80,9 @@ struct bpf_local_storage_elem { * after raw_spin_unlock */ }; + atomic_t link_cnt; u16 size; bool use_kmalloc_nolock; - /* 4 bytes hole */ /* The data is stored in another cacheline to minimize * the number of cachelines access during a cache hit. 
*/ diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 62201552dca6a..4c682d5aef7f8 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -97,6 +97,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, if (swap_uptrs) bpf_obj_swap_uptrs(smap->map.record, SDATA(selem)->data, value); } + atomic_set(&selem->link_cnt, 2); selem->size = smap->elem_size; selem->use_kmalloc_nolock = smap->use_kmalloc_nolock; return selem; @@ -200,9 +201,11 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu) /* The bpf_local_storage_map_free will wait for rcu_barrier */ smap = rcu_dereference_check(SDATA(selem)->smap, 1); - migrate_disable(); - bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); - migrate_enable(); + if (smap) { + migrate_disable(); + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); + migrate_enable(); + } kfree_nolock(selem); } @@ -227,7 +230,8 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, * is only supported in task local storage, where * smap->use_kmalloc_nolock == true. */ - bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); + if (smap) + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); __bpf_selem_free(selem, reuse_now); return; } @@ -419,6 +423,71 @@ int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) return err; } +/* Callers of bpf_selem_unlink_lockless() */ +#define BPF_LOCAL_STORAGE_MAP_FREE 0 +#define BPF_LOCAL_STORAGE_DESTROY 1 + +/* + * Unlink an selem from map and local storage with lockless fallback if callers + * are racing or rqspinlock returns an error. It should only be called by + * bpf_local_storage_destroy() or bpf_local_storage_map_free(). + */ +static void bpf_selem_unlink_lockless(struct bpf_local_storage_elem *selem, + struct hlist_head *to_free, int caller) +{ + struct bpf_local_storage *local_storage; + struct bpf_local_storage_map_bucket *b; + struct bpf_local_storage_map *smap; + unsigned long flags; + int err, unlink = 0; + + local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held()); + smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); + + /* + * Free special fields immediately as SDATA(selem)->smap will be cleared. + * No BPF program should be reading the selem. + */ + if (smap) { + b = select_bucket(smap, selem); + err = raw_res_spin_lock_irqsave(&b->lock, flags); + if (!err) { + if (likely(selem_linked_to_map(selem))) { + hlist_del_init_rcu(&selem->map_node); + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); + RCU_INIT_POINTER(SDATA(selem)->smap, NULL); + unlink++; + } + raw_res_spin_unlock_irqrestore(&b->lock, flags); + } else if (caller == BPF_LOCAL_STORAGE_MAP_FREE) { + RCU_INIT_POINTER(SDATA(selem)->smap, NULL); + } + } + + /* + * Only let destroy() unlink from local_storage->list and do mem_uncharge + * as owner is guaranteed to be valid in destroy(). + */ + if (local_storage && caller == BPF_LOCAL_STORAGE_DESTROY) { + err = raw_res_spin_lock_irqsave(&local_storage->lock, flags); + if (!err) { + hlist_del_init_rcu(&selem->snode); + unlink++; + raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); + } + RCU_INIT_POINTER(selem->local_storage, NULL); + } + + /* + * Normally, an selem can be unlinked under local_storage->lock and b->lock, and + * then added to a local to_free list. 
However, if destroy() and map_free() are + * racing or rqspinlock returns errors in unlikely situations (unlink != 2), free + * the selem only after both map_free() and destroy() drop the refcnt. + */ + if (unlink == 2 || atomic_dec_and_test(&selem->link_cnt)) + hlist_add_head(&selem->free_node, to_free); +} + void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *selem) From edcc76e16d0f00d96c30a3b269712c8a810a8497 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 18 Dec 2025 09:56:21 -0800 Subject: [PATCH 11/16] bpf: Switch to bpf_selem_unlink_lockless in bpf_local_storage_{map_free, destroy} Take care of rqspinlock errors in bpf_local_storage_{map_free, destroy}() properly by switching to bpf_selem_unlink_lockless(). Pass reuse_now == false when calling bpf_selem_free_list() since both callers iterate lists of selems without holding locks. An selem can only be freed after an RCU grace period. Similarly, SDATA(selem)->smap and selem->local_storage need to be protected by RCU as well since one caller can update these fields while the other is reading them at the same time. Pass reuse_now == false when calling bpf_local_storage_free(). The local storage map is already protected as bpf_local_storage_map_free() waits for an RCU grace period after iterating b->list and before freeing itself. Co-developed-by: Martin KaFai Lau Signed-off-by: Martin KaFai Lau Signed-off-by: Amery Hung --- include/linux/bpf_local_storage.h | 2 +- kernel/bpf/bpf_cgrp_storage.c | 1 + kernel/bpf/bpf_inode_storage.c | 1 + kernel/bpf/bpf_local_storage.c | 52 ++++++++++++++++++------------- kernel/bpf/bpf_task_storage.c | 1 + net/core/bpf_sk_storage.c | 7 ++++- 6 files changed, 41 insertions(+), 23 deletions(-) diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 1fd908c44fb6a..14f8e5edf0a28 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -165,7 +165,7 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage, return SDATA(selem); } -void bpf_local_storage_destroy(struct bpf_local_storage *local_storage); +u32 bpf_local_storage_destroy(struct bpf_local_storage *local_storage); void bpf_local_storage_map_free(struct bpf_map *map, struct bpf_local_storage_cache *cache); diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 853183eead2c2..9289b0c3fae9e 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -28,6 +28,7 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup) goto out; bpf_local_storage_destroy(local_storage); + RCU_INIT_POINTER(cgroup->bpf_cgrp_storage, NULL); out: rcu_read_unlock(); } diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 470f4b02c79ea..120354ef0bf83 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -69,6 +69,7 @@ void bpf_inode_storage_free(struct inode *inode) goto out; bpf_local_storage_destroy(local_storage); + RCU_INIT_POINTER(bsb->storage, NULL); out: rcu_read_unlock_migrate(); } diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 4c682d5aef7f8..f63b3c2241f09 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -797,13 +797,22 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, return 0; } -void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) +/* + * Destroy local storage when the owner is going away. 
Caller must clear owner->storage + * and uncharge memory if memory charging is used. + * + * Since smaps associated with selems may already be gone, mem_uncharge() or + * owner_storage() cannot be called in this function. Let the owner (i.e., the caller) + * do it instead. It is safe for the caller to clear owner_storage without taking + * local_storage->lock as bpf_local_storage_map_free() does not free local_storage and + * no BPF program should be running and freeing the local storage. + */ +u32 bpf_local_storage_destroy(struct bpf_local_storage *local_storage) { struct bpf_local_storage_elem *selem; - bool free_storage = false; HLIST_HEAD(free_selem_list); struct hlist_node *n; - unsigned long flags; + u32 uncharge = 0; /* Neither the bpf_prog nor the bpf_map's syscall * could be modifying the local_storage->list now. @@ -814,27 +823,22 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) * when unlinking elem from the local_storage->list and * the map's bucket->list. */ - WARN_ON(raw_res_spin_lock_irqsave(&local_storage->lock, flags)); hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { - /* Always unlink from map before unlinking from - * local_storage. - */ - WARN_ON(bpf_selem_unlink_map(selem)); - /* If local_storage list has only one element, the - * bpf_selem_unlink_storage_nolock() will return true. - * Otherwise, it will return false. The current loop iteration - * intends to remove all local storage. So the last iteration - * of the loop will set the free_cgroup_storage to true. - */ - free_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, &free_selem_list); + uncharge += selem->size; + bpf_selem_unlink_lockless(selem, &free_selem_list, BPF_LOCAL_STORAGE_DESTROY); } - raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); + uncharge += sizeof(*local_storage); + local_storage->owner = NULL; - bpf_selem_free_list(&free_selem_list, true); + /* + * Need to wait an RCU gp before freeing selem and local_storage + * since bpf_local_storage_map_free() may still be referencing them. 
+	 */
+	bpf_selem_free_list(&free_selem_list, false);
+
+	bpf_local_storage_free(local_storage, false);
 
-	if (free_storage)
-		bpf_local_storage_free(local_storage, true);
+	return uncharge;
 }
 
 u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
@@ -903,6 +907,7 @@ void bpf_local_storage_map_free(struct bpf_map *map,
 	struct bpf_local_storage_map_bucket *b;
 	struct bpf_local_storage_elem *selem;
 	struct bpf_local_storage_map *smap;
+	HLIST_HEAD(free_selem_list);
 	unsigned int i;
 
 	smap = (struct bpf_local_storage_map *)map;
@@ -931,7 +936,12 @@ void bpf_local_storage_map_free(struct bpf_map *map,
 		while ((selem = hlist_entry_safe(
 				rcu_dereference_raw(hlist_first_rcu(&b->list)),
 				struct bpf_local_storage_elem, map_node))) {
-			WARN_ON(bpf_selem_unlink(selem, true));
+
+			bpf_selem_unlink_lockless(selem, &free_selem_list,
+						  BPF_LOCAL_STORAGE_MAP_FREE);
+
+			bpf_selem_free_list(&free_selem_list, false);
+
 			cond_resched_rcu();
 		}
 		rcu_read_unlock();
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 4d53aebe67848..7b2c8d428caa3 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -54,6 +54,7 @@ void bpf_task_storage_free(struct task_struct *task)
 		goto out;
 
 	bpf_local_storage_destroy(local_storage);
+	RCU_INIT_POINTER(task->bpf_storage, NULL);
 out:
 	rcu_read_unlock();
 }
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 38acbecb8ef7f..64a52e57953c3 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -47,13 +47,18 @@ static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map)
 void bpf_sk_storage_free(struct sock *sk)
 {
 	struct bpf_local_storage *sk_storage;
+	u32 uncharge;
 
 	rcu_read_lock_dont_migrate();
 	sk_storage = rcu_dereference(sk->sk_bpf_storage);
 	if (!sk_storage)
 		goto out;
 
-	bpf_local_storage_destroy(sk_storage);
+	uncharge = bpf_local_storage_destroy(sk_storage);
+	if (uncharge)
+		atomic_sub(uncharge, &sk->sk_omem_alloc);
+
+	RCU_INIT_POINTER(sk->sk_bpf_storage, NULL);
 out:
 	rcu_read_unlock_migrate();
 }

From 4e4e6fd302fdb74d20b33ff86196b227e3833db5 Mon Sep 17 00:00:00 2001
From: Amery Hung
Date: Thu, 18 Dec 2025 09:56:22 -0800
Subject: [PATCH 12/16] selftests/bpf: Update sk_storage_omem_uncharge test

Check sk_omem_alloc when the caller of bpf_local_storage_destroy()
returns. bpf_local_storage_destroy() now returns the amount of memory
to uncharge to the caller instead of uncharging it directly. Therefore,
in the sk_storage_omem_uncharge test, check sk_omem_alloc when
bpf_sk_storage_free() returns instead of when
bpf_local_storage_destroy() returns.
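
As an illustration, an owner-side free path now follows this pattern (a
minimal sketch modeled on the bpf_sk_storage_free() hunk in the previous
patch; owner_storage_free() is a made-up name and the RCU locking is
simplified, so this is not the exact kernel code):

	void owner_storage_free(struct sock *sk)
	{
		struct bpf_local_storage *storage;
		u32 uncharge;

		rcu_read_lock();
		storage = rcu_dereference(sk->sk_bpf_storage);
		if (storage) {
			/* destroy() only unlinks and schedules frees; it
			 * returns the number of bytes the owner still has
			 * to uncharge.
			 */
			uncharge = bpf_local_storage_destroy(storage);
			if (uncharge)
				atomic_sub(uncharge, &sk->sk_omem_alloc);
			/* The owner, not destroy(), clears its pointer. */
			RCU_INIT_POINTER(sk->sk_bpf_storage, NULL);
		}
		rcu_read_unlock();
	}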
Signed-off-by: Amery Hung
---
 .../selftests/bpf/progs/sk_storage_omem_uncharge.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/tools/testing/selftests/bpf/progs/sk_storage_omem_uncharge.c b/tools/testing/selftests/bpf/progs/sk_storage_omem_uncharge.c
index 46d6eb2a3b177..c8f4815c8dfb6 100644
--- a/tools/testing/selftests/bpf/progs/sk_storage_omem_uncharge.c
+++ b/tools/testing/selftests/bpf/progs/sk_storage_omem_uncharge.c
@@ -6,7 +6,6 @@
 #include <bpf/bpf_tracing.h>
 #include <bpf/bpf_core_read.h>
 
-void *local_storage_ptr = NULL;
 void *sk_ptr = NULL;
 int cookie_found = 0;
 __u64 cookie = 0;
@@ -19,21 +18,17 @@ struct {
 	__type(value, int);
 } sk_storage SEC(".maps");
 
-SEC("fexit/bpf_local_storage_destroy")
-int BPF_PROG(bpf_local_storage_destroy, struct bpf_local_storage *local_storage)
+SEC("fexit/bpf_sk_storage_free")
+int BPF_PROG(bpf_sk_storage_free, struct sock *sk)
 {
-	struct sock *sk;
-
-	if (local_storage_ptr != local_storage)
+	if (sk_ptr != sk)
 		return 0;
 
-	sk = bpf_core_cast(sk_ptr, struct sock);
 	if (sk->sk_cookie.counter != cookie)
 		return 0;
 
 	cookie_found++;
 	omem = sk->sk_omem_alloc.counter;
-	local_storage_ptr = NULL;
 	return 0;
 }
 
@@ -50,7 +45,6 @@ int BPF_PROG(inet6_sock_destruct, struct sock *sk)
 	if (value && *value == 0xdeadbeef) {
 		cookie_found++;
 		sk_ptr = sk;
-		local_storage_ptr = sk->sk_bpf_storage;
 	}
 
 	return 0;

From fd2787aa12530afb4d489b77f93b4788c3988976 Mon Sep 17 00:00:00 2001
From: Amery Hung
Date: Thu, 18 Dec 2025 09:56:23 -0800
Subject: [PATCH 13/16] selftests/bpf: Update task_local_storage/recursion
 test

Update the expected result of the selftest as recursion of the task
local storage syscall and helpers has been relaxed. Now that the percpu
counter is removed, the task local storage helpers,
bpf_task_storage_get() and bpf_task_storage_delete(), can run on the
same CPU at the same time unless they would cause a deadlock.

Note that since there is no percpu counter preventing recursion in the
task local storage helpers, the bpf_trampoline now catches the
recursion of on_update, as reported by recursion_misses.
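
As a rough userspace sketch (mirroring the test update below; skel and
the on_update program come from the test's skeleton), the trampoline's
recursion count can be read back through bpf_prog_get_info_by_fd():

	struct bpf_prog_info info = {};
	__u32 info_len = sizeof(info);
	int prog_fd = bpf_program__fd(skel->progs.on_update);

	/* recursion_misses counts how many times the trampoline
	 * suppressed a recursive entry into the program.
	 */
	if (!bpf_prog_get_info_by_fd(prog_fd, &info, &info_len))
		printf("recursion_misses: %llu\n",
		       (unsigned long long)info.recursion_misses);

The following trace compares the old and new behavior: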
on_enter: tp_btf/sys_enter
on_update: fentry/bpf_local_storage_update

Old behavior:

  on_enter
    bpf_task_storage_get(&map_a)
      bpf_task_storage_trylock succeed
      bpf_local_storage_update(&map_a)
        on_update
          bpf_task_storage_get(&map_a)
            bpf_task_storage_trylock fail
            return NULL
          bpf_task_storage_get(&map_b)
            bpf_task_storage_trylock fail
            return NULL
      create and return map_a::ptr
    map_a::ptr = 200
    bpf_task_storage_get(&map_b)
      bpf_task_storage_trylock succeed
      bpf_local_storage_update(&map_b)
        on_update
          bpf_task_storage_get(&map_a)
            bpf_task_storage_trylock fail
            lockless lookup succeed
            return map_a::ptr
          map_a::ptr += 1 (201)
          bpf_task_storage_delete(&map_a)
            bpf_task_storage_trylock fail
            return -EBUSY
          nr_del_errs++ (1)
          bpf_task_storage_get(&map_b)
            bpf_task_storage_trylock fail
            return NULL
      create and return ptr
    map_b::ptr = 100

New behavior:

  on_enter
    bpf_task_storage_get(&map_a)
      bpf_local_storage_update(&map_a)
        on_update
          bpf_task_storage_get(&map_a)
            on_update::misses++ (1)
            create and return map_a::ptr
          map_a::ptr += 1 (1)
          bpf_task_storage_delete(&map_a)
            return 0
          bpf_task_storage_get(&map_b)
            on_update::misses++ (2)
            create and return map_b::ptr
          map_b::ptr += 1 (1)
      create and return map_a::ptr
    map_a::ptr = 200
    bpf_task_storage_get(&map_b)
      lockless lookup succeed
      return map_b::ptr

Expected result:

                                Old    New
  map_a::ptr                    201    200
  map_b::ptr                    100    1
  nr_del_errs                   1      0
  on_update::recursion_misses   0      2
  on_enter::recursion_misses    0      0

Signed-off-by: Amery Hung
---
 .../testing/selftests/bpf/prog_tests/task_local_storage.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
index 42e822ea352f1..559727b05e086 100644
--- a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
+++ b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
@@ -117,19 +117,19 @@ static void test_recursion(void)
 	map_fd = bpf_map__fd(skel->maps.map_a);
 	err = bpf_map_lookup_elem(map_fd, &task_fd, &value);
 	ASSERT_OK(err, "lookup map_a");
-	ASSERT_EQ(value, 201, "map_a value");
-	ASSERT_EQ(skel->bss->nr_del_errs, 1, "bpf_task_storage_delete busy");
+	ASSERT_EQ(value, 200, "map_a value");
+	ASSERT_EQ(skel->bss->nr_del_errs, 0, "bpf_task_storage_delete busy");
 
 	map_fd = bpf_map__fd(skel->maps.map_b);
 	err = bpf_map_lookup_elem(map_fd, &task_fd, &value);
 	ASSERT_OK(err, "lookup map_b");
-	ASSERT_EQ(value, 100, "map_b value");
+	ASSERT_EQ(value, 1, "map_b value");
 
 	prog_fd = bpf_program__fd(skel->progs.on_update);
 	memset(&info, 0, sizeof(info));
 	err = bpf_prog_get_info_by_fd(prog_fd, &info, &info_len);
 	ASSERT_OK(err, "get prog info");
-	ASSERT_EQ(info.recursion_misses, 0, "on_update prog recursion");
+	ASSERT_EQ(info.recursion_misses, 2, "on_update prog recursion");
 
 	prog_fd = bpf_program__fd(skel->progs.on_enter);
 	memset(&info, 0, sizeof(info));

From 40dbddf479931809af6f768d621ba386f56a8b48 Mon Sep 17 00:00:00 2001
From: Amery Hung
Date: Thu, 18 Dec 2025 09:56:24 -0800
Subject: [PATCH 14/16] selftests/bpf: Update
 task_local_storage/task_storage_nodeadlock test

Adjust the error codes we are checking against, as
bpf_task_storage_get() and bpf_task_storage_delete() now return
-EDEADLK or -ETIMEDOUT when a deadlock happens.
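
For illustration, the in-program check now looks roughly like this (a
sketch of the task_storage_nodeadlock.c change below; the comment is my
reading of the error semantics, not wording from the kernel source):

	long ret;

	ret = bpf_task_storage_delete(&task_storage,
				      bpf_get_current_task_btf());
	/* rqspinlock reports a detected deadlock as -EDEADLK and a lock
	 * acquisition timeout as -ETIMEDOUT, replacing the old -EBUSY
	 * from the percpu-counter trylock scheme.
	 */
	if (ret == -EDEADLK || ret == -ETIMEDOUT)
		__sync_fetch_and_add(&nr_del_errs, 1);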
Signed-off-by: Amery Hung
---
 .../testing/selftests/bpf/progs/task_storage_nodeadlock.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/bpf/progs/task_storage_nodeadlock.c b/tools/testing/selftests/bpf/progs/task_storage_nodeadlock.c
index 986829aaf73a6..6ce98fe9f387d 100644
--- a/tools/testing/selftests/bpf/progs/task_storage_nodeadlock.c
+++ b/tools/testing/selftests/bpf/progs/task_storage_nodeadlock.c
@@ -1,15 +1,12 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "vmlinux.h"
+#include <errno.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 
 char _license[] SEC("license") = "GPL";
 
-#ifndef EBUSY
-#define EBUSY 16
-#endif
-
 extern bool CONFIG_PREEMPTION __kconfig __weak;
 int nr_get_errs = 0;
 int nr_del_errs = 0;
@@ -40,7 +37,7 @@ int BPF_PROG(socket_post_create, struct socket *sock, int family, int type,
 
 	ret = bpf_task_storage_delete(&task_storage,
 				      bpf_get_current_task_btf());
-	if (ret == -EBUSY)
+	if (ret == -EDEADLK || ret == -ETIMEDOUT)
 		__sync_fetch_and_add(&nr_del_errs, 1);
 
 	return 0;

From 3e1b3ca7e1ca37866ca7998d438209f690faed10 Mon Sep 17 00:00:00 2001
From: Amery Hung
Date: Thu, 18 Dec 2025 09:56:25 -0800
Subject: [PATCH 15/16] selftests/bpf: Remove test_task_storage_map_stress_lookup

Remove a test in test_maps that checks whether updating the percpu
counter in the task local storage map is preemption safe, as the percpu
counter is now removed.

Signed-off-by: Amery Hung
---
 .../bpf/map_tests/task_storage_map.c          | 128 ------------------
 .../bpf/progs/read_bpf_task_storage_busy.c    |  38 ------
 2 files changed, 166 deletions(-)
 delete mode 100644 tools/testing/selftests/bpf/map_tests/task_storage_map.c
 delete mode 100644 tools/testing/selftests/bpf/progs/read_bpf_task_storage_busy.c

diff --git a/tools/testing/selftests/bpf/map_tests/task_storage_map.c b/tools/testing/selftests/bpf/map_tests/task_storage_map.c
deleted file mode 100644
index a4121d2248ac8..0000000000000
--- a/tools/testing/selftests/bpf/map_tests/task_storage_map.c
+++ /dev/null
@@ -1,128 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2022. Huawei Technologies Co., Ltd */
-#define _GNU_SOURCE
-#include <sched.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <errno.h>
-#include <string.h>
-#include <pthread.h>
-
-#include <bpf/bpf.h>
-#include <bpf/libbpf.h>
-
-#include "bpf_util.h"
-#include "test_maps.h"
-#include "task_local_storage_helpers.h"
-#include "read_bpf_task_storage_busy.skel.h"
-
-struct lookup_ctx {
-	bool start;
-	bool stop;
-	int pid_fd;
-	int map_fd;
-	int loop;
-};
-
-static void *lookup_fn(void *arg)
-{
-	struct lookup_ctx *ctx = arg;
-	long value;
-	int i = 0;
-
-	while (!ctx->start)
-		usleep(1);
-
-	while (!ctx->stop && i++ < ctx->loop)
-		bpf_map_lookup_elem(ctx->map_fd, &ctx->pid_fd, &value);
-	return NULL;
-}
-
-static void abort_lookup(struct lookup_ctx *ctx, pthread_t *tids, unsigned int nr)
-{
-	unsigned int i;
-
-	ctx->stop = true;
-	ctx->start = true;
-	for (i = 0; i < nr; i++)
-		pthread_join(tids[i], NULL);
-}
-
-void test_task_storage_map_stress_lookup(void)
-{
-#define MAX_NR_THREAD 4096
-	unsigned int i, nr = 256, loop = 8192, cpu = 0;
-	struct read_bpf_task_storage_busy *skel;
-	pthread_t tids[MAX_NR_THREAD];
-	struct lookup_ctx ctx;
-	cpu_set_t old, new;
-	const char *cfg;
-	int err;
-
-	cfg = getenv("TASK_STORAGE_MAP_NR_THREAD");
-	if (cfg) {
-		nr = atoi(cfg);
-		if (nr > MAX_NR_THREAD)
-			nr = MAX_NR_THREAD;
-	}
-	cfg = getenv("TASK_STORAGE_MAP_NR_LOOP");
-	if (cfg)
-		loop = atoi(cfg);
-	cfg = getenv("TASK_STORAGE_MAP_PIN_CPU");
-	if (cfg)
-		cpu = atoi(cfg);
-
-	skel = read_bpf_task_storage_busy__open_and_load();
-	err = libbpf_get_error(skel);
-	CHECK(err, "open_and_load", "error %d\n", err);
-
-	/* Only for a fully preemptible kernel */
-	if (!skel->kconfig->CONFIG_PREEMPTION) {
-		printf("%s SKIP (no CONFIG_PREEMPTION)\n", __func__);
-		read_bpf_task_storage_busy__destroy(skel);
-		skips++;
-		return;
-	}
-
-	/* Save the old affinity setting */
-	sched_getaffinity(getpid(), sizeof(old), &old);
-
-	/* Pinned on a specific CPU */
-	CPU_ZERO(&new);
-	CPU_SET(cpu, &new);
-	sched_setaffinity(getpid(), sizeof(new), &new);
-
-	ctx.start = false;
-	ctx.stop = false;
-	ctx.pid_fd = sys_pidfd_open(getpid(), 0);
-	ctx.map_fd = bpf_map__fd(skel->maps.task);
-	ctx.loop = loop;
-	for (i = 0; i < nr; i++) {
-		err = pthread_create(&tids[i], NULL, lookup_fn, &ctx);
-		if (err) {
-			abort_lookup(&ctx, tids, i);
-			CHECK(err, "pthread_create", "error %d\n", err);
-			goto out;
-		}
-	}
-
-	ctx.start = true;
-	for (i = 0; i < nr; i++)
-		pthread_join(tids[i], NULL);
-
-	skel->bss->pid = getpid();
-	err = read_bpf_task_storage_busy__attach(skel);
-	CHECK(err, "attach", "error %d\n", err);
-
-	/* Trigger program */
-	sys_gettid();
-	skel->bss->pid = 0;
-
-	CHECK(skel->bss->busy != 0, "bad bpf_task_storage_busy", "got %d\n", skel->bss->busy);
-out:
-	read_bpf_task_storage_busy__destroy(skel);
-	/* Restore affinity setting */
-	sched_setaffinity(getpid(), sizeof(old), &old);
-	printf("%s:PASS\n", __func__);
-}
diff --git a/tools/testing/selftests/bpf/progs/read_bpf_task_storage_busy.c b/tools/testing/selftests/bpf/progs/read_bpf_task_storage_busy.c
deleted file mode 100644
index 69da05bb6c63e..0000000000000
--- a/tools/testing/selftests/bpf/progs/read_bpf_task_storage_busy.c
+++ /dev/null
@@ -1,38 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2022. Huawei Technologies Co., Ltd */
-#include "vmlinux.h"
-#include <bpf/bpf_helpers.h>
-#include <bpf/bpf_tracing.h>
-
-extern bool CONFIG_PREEMPTION __kconfig __weak;
-extern const int bpf_task_storage_busy __ksym;
-
-char _license[] SEC("license") = "GPL";
-
-int pid = 0;
-int busy = 0;
-
-struct {
-	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
-	__uint(map_flags, BPF_F_NO_PREALLOC);
-	__type(key, int);
-	__type(value, long);
-} task SEC(".maps");
-
-SEC("raw_tp/sys_enter")
-int BPF_PROG(read_bpf_task_storage_busy)
-{
-	int *value;
-
-	if (!CONFIG_PREEMPTION)
-		return 0;
-
-	if (bpf_get_current_pid_tgid() >> 32 != pid)
-		return 0;
-
-	value = bpf_this_cpu_ptr(&bpf_task_storage_busy);
-	if (value)
-		busy = *value;
-
-	return 0;
-}

From 8fcb1c3ca3a5a568c0cec4134c9ed4899fa5f84b Mon Sep 17 00:00:00 2001
From: Amery Hung
Date: Thu, 18 Dec 2025 09:56:26 -0800
Subject: [PATCH 16/16] selftests/bpf: Choose another percpu variable in bpf
 for btf_dump test

bpf_cgrp_storage_busy has been removed. Use bpf_bprintf_nest_level
instead. This percpu variable is also in the bpf subsystem, so if it is
removed in the future, BPF CI will catch this type of CI-breaking
change.

Signed-off-by: Amery Hung
---
 tools/testing/selftests/bpf/prog_tests/btf_dump.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c
index 10cba526d3e63..f1642794f70ed 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c
@@ -875,8 +875,8 @@ static void test_btf_dump_var_data(struct btf *btf, struct btf_dump *d,
 	TEST_BTF_DUMP_VAR(btf, d, NULL, str, "cpu_number", int, BTF_F_COMPACT,
 			  "int cpu_number = (int)100", 100);
 #endif
-	TEST_BTF_DUMP_VAR(btf, d, NULL, str, "bpf_cgrp_storage_busy", int, BTF_F_COMPACT,
-			  "static int bpf_cgrp_storage_busy = (int)2", 2);
+	TEST_BTF_DUMP_VAR(btf, d, NULL, str, "bpf_bprintf_nest_level", int, BTF_F_COMPACT,
+			  "static int bpf_bprintf_nest_level = (int)2", 2);
 }
 
 struct btf_dump_string_ctx {