diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a38e474bf7ee..ed0122b45b63 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -668,6 +668,7 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map) #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_INET) struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key); +struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key); int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type); #else static inline struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) @@ -675,6 +676,12 @@ static inline struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) return NULL; } +static inline struct sock *__sock_hash_lookup_elem(struct bpf_map *map, + void *key) +{ + return NULL; /* no sockhash support in this config */ +} + static inline int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) @@ -724,6 +731,7 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto; extern const struct bpf_func_proto bpf_get_stackid_proto; extern const struct bpf_func_proto bpf_get_stack_proto; extern const struct bpf_func_proto bpf_sock_map_update_proto; +extern const struct bpf_func_proto bpf_sock_hash_update_proto; /* Shared helpers among cBPF and eBPF. 
*/ void bpf_user_rnd_init_once(void); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index d7df1b323082..b67f8793de0d 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -47,6 +47,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_INET) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #if defined(CONFIG_XDP_SOCKETS) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 02e4112510f8..d94d333a8225 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -118,6 +118,7 @@ enum bpf_map_type { BPF_MAP_TYPE_SOCKMAP, BPF_MAP_TYPE_CPUMAP, BPF_MAP_TYPE_XSKMAP, + BPF_MAP_TYPE_SOCKHASH, }; enum bpf_prog_type { @@ -1828,7 +1829,6 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) * Description * Do FIB lookup in kernel tables using parameters in *params*. @@ -1855,6 +1855,53 @@ union bpf_attr { * Egress device index on success, 0 if packet needs to continue * up the stack for further processing or a negative error in case * of failure. + * + * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags) + * Description + * Add an entry to, or update a sockhash *map* referencing sockets. + * The *skops* is used as a new value for the entry associated to + * *key*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. + * + * If the *map* has eBPF programs (parser and verdict), those will + * be inherited by the socket being added. 
If the socket is + * already attached to eBPF programs, this results in an error. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags) + * Description + * This helper is used in programs implementing policies at the + * socket level. If the message *msg* is allowed to pass (i.e. if + * the verdict eBPF program returns **SK_PASS**), redirect it to + * the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. + * + * int bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags) + * Description + * This helper is used in programs implementing policies at the + * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. + * if the verdict eBPF program returns **SK_PASS**), redirect it + * to the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -1926,7 +1973,10 @@ union bpf_attr { FN(skb_get_xfrm_state), \ FN(get_stack), \ FN(skb_load_bytes_relative), \ - FN(fib_lookup), + FN(fib_lookup), \ + FN(sock_hash_update), \ + FN(msg_redirect_hash), \ + FN(sk_redirect_hash), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d0d7d9462368..2194c6a9df42 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1707,6 +1707,7 @@ const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; const struct bpf_func_proto bpf_sock_map_update_proto __weak; +const struct bpf_func_proto bpf_sock_hash_update_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index beab9ec9b023..56879c9fd3a4 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -60,6 +60,28 @@ struct bpf_stab { struct bpf_sock_progs progs; }; +struct bucket { + struct hlist_head head; /* chain of htab_elem */ + raw_spinlock_t lock; /* guards update/delete/lookup */ +}; + +struct bpf_htab { + struct bpf_map map; + struct bucket *buckets; + atomic_t count; /* number of elements */ + u32 n_buckets; /* always a power of two */ + u32 elem_size; /* htab_elem + padded key */ + struct bpf_sock_progs progs; +}; + +struct htab_elem { + struct rcu_head rcu; + struct hlist_node hash_node; + u32 hash; /* jhash() of key */ + struct sock *sk; + char key[0]; /* key_size bytes */ +}; + enum smap_psock_state { SMAP_TX_RUNNING, }; @@ -67,6 +89,8 @@ enum smap_psock_state { struct smap_psock_map_entry { struct list_head list; struct sock **entry; + struct htab_elem *hash_link; /* sockhash entries only */ + struct bpf_htab *htab; /* owner of hash_link */ }; struct smap_psock { @@ -195,6 +219,12 @@ out: rcu_read_unlock(); } +static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) +{ + atomic_dec(&htab->count); + kfree_rcu(l, rcu); +} + static void bpf_tcp_close(struct sock *sk, long timeout) { 
void (*close_fun)(struct sock *sk, long timeout); @@ -231,10 +261,16 @@ static void bpf_tcp_close(struct sock *sk, long timeout) } list_for_each_entry_safe(e, tmp, &psock->maps, list) { - osk = cmpxchg(e->entry, sk, NULL); - if (osk == sk) { - list_del(&e->list); - smap_release_sock(psock, sk); + if (e->entry) { + osk = cmpxchg(e->entry, sk, NULL); + if (osk == sk) { + list_del(&e->list); + smap_release_sock(psock, sk); + } + } else { + hlist_del_rcu(&e->hash_link->hash_node); + smap_release_sock(psock, e->hash_link->sk); + free_htab_elem(e->htab, e->hash_link); } } write_unlock_bh(&sk->sk_callback_lock); @@ -1527,12 +1563,14 @@ free_stab: return ERR_PTR(err); } -static void smap_list_remove(struct smap_psock *psock, struct sock **entry) +/* Exactly one of @entry / @hash_link is non-NULL; match only on that one. */ +static void smap_list_remove(struct smap_psock *psock, struct sock **entry, + struct htab_elem *hash_link) { struct smap_psock_map_entry *e, *tmp; list_for_each_entry_safe(e, tmp, &psock->maps, list) { - if (e->entry == entry) { + if (entry ? e->entry == entry : e->hash_link == hash_link) { list_del(&e->list); break; } @@ -1570,7 +1608,7 @@ static void sock_map_free(struct bpf_map *map) * to be null and queued for garbage collection. 
*/ if (likely(psock)) { - smap_list_remove(psock, &stab->sock_map[i]); + smap_list_remove(psock, &stab->sock_map[i], NULL); smap_release_sock(psock, sock); } write_unlock_bh(&sock->sk_callback_lock); @@ -1629,7 +1667,7 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key) if (psock->bpf_parse) smap_stop_sock(psock, sock); - smap_list_remove(psock, &stab->sock_map[k]); + smap_list_remove(psock, &stab->sock_map[k], NULL); smap_release_sock(psock, sock); out: write_unlock_bh(&sock->sk_callback_lock); @@ -1746,10 +1784,12 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, new = true; } - e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); - if (!e) { - err = -ENOMEM; - goto out_progs; + if (map_link) { + e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); + if (!e) { + err = -ENOMEM; + goto out_progs; + } } /* 3. At this point we have a reference to a valid psock that is @@ -1783,6 +1823,7 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, write_unlock_bh(&sock->sk_callback_lock); return err; out_free: + kfree(e); smap_release_sock(psock, sock); out_progs: if (verdict) @@ -1829,7 +1870,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, struct smap_psock *opsock = smap_psock_sk(osock); write_lock_bh(&osock->sk_callback_lock); - smap_list_remove(opsock, &stab->sock_map[i]); + smap_list_remove(opsock, &stab->sock_map[i], NULL); smap_release_sock(opsock, osock); write_unlock_bh(&osock->sk_callback_lock); } @@ -1846,6 +1887,10 @@ int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) struct bpf_stab *stab = container_of(map, struct bpf_stab, map); progs = &stab->progs; + } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH) { + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + + progs = &htab->progs; } else { return -EINVAL; } @@ -1906,11 +1951,19 @@ static int sock_map_update_elem(struct bpf_map *map, static void sock_map_release(struct bpf_map *map) { - struct bpf_stab *stab = 
container_of(map, struct bpf_stab, map); struct bpf_sock_progs *progs; struct bpf_prog *orig; - progs = &stab->progs; + if (map->map_type == BPF_MAP_TYPE_SOCKMAP) { + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + + progs = &stab->progs; + } else { + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + + progs = &htab->progs; + } + orig = xchg(&progs->bpf_parse, NULL); if (orig) bpf_prog_put(orig); @@ -1923,6 +1976,395 @@ static void sock_map_release(struct bpf_map *map) bpf_prog_put(orig); } +static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) +{ + struct bpf_htab *htab; + int i, err; + u64 cost; + + if (!capable(CAP_NET_ADMIN)) + return ERR_PTR(-EPERM); + + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->value_size != 4 || + attr->map_flags & ~SOCK_CREATE_FLAG_MASK) + return ERR_PTR(-EINVAL); + + err = bpf_tcp_ulp_register(); + if (err && err != -EEXIST) + return ERR_PTR(err); + + htab = kzalloc(sizeof(*htab), GFP_USER); + if (!htab) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&htab->map, attr); + + htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); + htab->elem_size = sizeof(struct htab_elem) + + round_up(htab->map.key_size, 8); + err = -EINVAL; + if (htab->n_buckets == 0 || + htab->n_buckets > U32_MAX / sizeof(struct bucket)) + goto free_htab; + + cost = (u64) htab->n_buckets * sizeof(struct bucket) + + (u64) htab->elem_size * htab->map.max_entries; + + if (cost >= U32_MAX - PAGE_SIZE) + goto free_htab; + + htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + err = bpf_map_precharge_memlock(htab->map.pages); + if (err) + goto free_htab; + + err = -ENOMEM; + htab->buckets = bpf_map_area_alloc( + htab->n_buckets * sizeof(struct bucket), + htab->map.numa_node); + if (!htab->buckets) + goto free_htab; + + for (i = 0; i < htab->n_buckets; i++) { + INIT_HLIST_HEAD(&htab->buckets[i].head); + raw_spin_lock_init(&htab->buckets[i].lock); + } + + return &htab->map; +free_htab: + 
kfree(htab); + return ERR_PTR(err); +} + +static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &htab->buckets[hash & (htab->n_buckets - 1)]; +} + +static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &__select_bucket(htab, hash)->head; +} + +static void sock_hash_free(struct bpf_map *map) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + int i; + + synchronize_rcu(); + + /* At this point no update, lookup or delete operations can happen. + * However, be aware we can still get socket state event updates, + * and data ready callbacks that reference the psock from sk_user_data. + * Also psock worker threads are still in-flight. So smap_release_sock + * will only free the psock after cancel_sync on the worker threads + * and a grace period expire to ensure psock is really safe to remove. + */ + rcu_read_lock(); + for (i = 0; i < htab->n_buckets; i++) { + struct hlist_head *head = select_bucket(htab, i); + struct hlist_node *n; + struct htab_elem *l; + + hlist_for_each_entry_safe(l, n, head, hash_node) { + struct sock *sock = l->sk; + struct smap_psock *psock; + + hlist_del_rcu(&l->hash_node); + write_lock_bh(&sock->sk_callback_lock); + psock = smap_psock_sk(sock); + /* This check handles a racing sock event that can get + * the sk_callback_lock before this case but after xchg + * causing the refcnt to hit zero and sock user data + * (psock) to be null and queued for garbage collection. 
+ */ + if (likely(psock)) { + smap_list_remove(psock, NULL, l); + smap_release_sock(psock, sock); + } + write_unlock_bh(&sock->sk_callback_lock); + kfree(l); + } + } + rcu_read_unlock(); + bpf_map_area_free(htab->buckets); + kfree(htab); +} + +static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab, + void *key, u32 key_size, u32 hash, + struct sock *sk, + struct htab_elem *old_elem) +{ + struct htab_elem *l_new; + + if (atomic_inc_return(&htab->count) > htab->map.max_entries) { + if (!old_elem) { + atomic_dec(&htab->count); + return ERR_PTR(-E2BIG); + } + } + l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, + htab->map.numa_node); + if (!l_new) { + /* give back the slot reserved by atomic_inc_return() */ + atomic_dec(&htab->count); + return ERR_PTR(-ENOMEM); + } + + memcpy(l_new->key, key, key_size); + l_new->sk = sk; + l_new->hash = hash; + return l_new; +} + +static struct htab_elem *lookup_elem_raw(struct hlist_head *head, + u32 hash, void *key, u32 key_size) +{ + struct htab_elem *l; + + hlist_for_each_entry_rcu(l, head, hash_node) { + if (l->hash == hash && !memcmp(&l->key, key, key_size)) + return l; + } + + return NULL; +} + +static inline u32 htab_map_hash(const void *key, u32 key_len) +{ + return jhash(key, key_len, 0); +} + +static int sock_hash_get_next_key(struct bpf_map *map, + void *key, void *next_key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct htab_elem *l, *next_l; + struct hlist_head *h; + u32 hash, key_size; + int i = 0; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + if (!key) + goto find_first_elem; + hash = htab_map_hash(key, key_size); + h = select_bucket(htab, hash); + + l = lookup_elem_raw(h, hash, key, key_size); + if (!l) + goto find_first_elem; + next_l = hlist_entry_safe( + rcu_dereference_raw(hlist_next_rcu(&l->hash_node)), + struct htab_elem, hash_node); + if (next_l) { + memcpy(next_key, next_l->key, key_size); + return 0; + } + + /* no more elements in this hash list, go to the next bucket */ + i = hash & (htab->n_buckets - 1); + i++; 
+ +find_first_elem: + /* iterate over buckets */ + for (; i < htab->n_buckets; i++) { + h = select_bucket(htab, i); + + /* pick first element in the bucket */ + next_l = hlist_entry_safe( + rcu_dereference_raw(hlist_first_rcu(h)), + struct htab_elem, hash_node); + if (next_l) { + /* if it's not empty, just return it */ + memcpy(next_key, next_l->key, key_size); + return 0; + } + } + + /* iterated over all buckets and all elements */ + return -ENOENT; +} + +static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, + struct bpf_map *map, + void *key, u64 map_flags) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct bpf_sock_progs *progs = &htab->progs; + struct htab_elem *l_new = NULL, *l_old; + struct smap_psock_map_entry *e = NULL; + struct hlist_head *head; + struct smap_psock *psock; + u32 key_size, hash; + struct sock *sock; + struct bucket *b; + int err; + + sock = skops->sk; + + if (sock->sk_type != SOCK_STREAM || + sock->sk_protocol != IPPROTO_TCP) + return -EOPNOTSUPP; + + if (unlikely(map_flags > BPF_EXIST)) + return -EINVAL; + + e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); + if (!e) + return -ENOMEM; + + WARN_ON_ONCE(!rcu_read_lock_held()); + key_size = map->key_size; + hash = htab_map_hash(key, key_size); + b = __select_bucket(htab, hash); + head = &b->head; + + err = __sock_map_ctx_update_elem(map, progs, sock, NULL, key); + if (err) + goto err; + + /* bpf_map_update_elem() can be called in_irq() */ + raw_spin_lock_bh(&b->lock); + l_old = lookup_elem_raw(head, hash, key, key_size); + if (l_old && map_flags == BPF_NOEXIST) { + err = -EEXIST; + goto bucket_err; + } + if (!l_old && map_flags == BPF_EXIST) { + err = -ENOENT; + goto bucket_err; + } + + l_new = alloc_sock_hash_elem(htab, key, key_size, hash, sock, l_old); + if (IS_ERR(l_new)) { + err = PTR_ERR(l_new); + goto bucket_err; + } + + psock = smap_psock_sk(sock); + if (unlikely(!psock)) { + /* l_new was never linked; free it and drop its count */ + free_htab_elem(htab, l_new); + err = -EINVAL; + goto bucket_err; + } + + e->hash_link = l_new; 
+ e->htab = container_of(map, struct bpf_htab, map); + list_add_tail(&e->list, &psock->maps); + + /* add new element to the head of the list, so that + * concurrent search will find it before old elem + */ + hlist_add_head_rcu(&l_new->hash_node, head); + if (l_old) { + psock = smap_psock_sk(l_old->sk); + + hlist_del_rcu(&l_old->hash_node); + smap_list_remove(psock, NULL, l_old); + smap_release_sock(psock, l_old->sk); + free_htab_elem(htab, l_old); + } + raw_spin_unlock_bh(&b->lock); + return 0; +bucket_err: + raw_spin_unlock_bh(&b->lock); +err: + kfree(e); + psock = smap_psock_sk(sock); + if (psock) + smap_release_sock(psock, sock); + return err; +} + +static int sock_hash_update_elem(struct bpf_map *map, + void *key, void *value, u64 flags) +{ + struct bpf_sock_ops_kern skops; + u32 fd = *(u32 *)value; + struct socket *socket; + int err; + + socket = sockfd_lookup(fd, &err); + if (!socket) + return err; + + skops.sk = socket->sk; + if (!skops.sk) { + fput(socket->file); + return -EINVAL; + } + + err = sock_hash_ctx_update_elem(&skops, map, key, flags); + fput(socket->file); + return err; +} + +static int sock_hash_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct bucket *b; + struct htab_elem *l; + u32 hash, key_size; + int ret = -ENOENT; + + key_size = map->key_size; + hash = htab_map_hash(key, key_size); + b = __select_bucket(htab, hash); + head = &b->head; + + raw_spin_lock_bh(&b->lock); + l = lookup_elem_raw(head, hash, key, key_size); + if (l) { + struct sock *sock = l->sk; + struct smap_psock *psock; + + hlist_del_rcu(&l->hash_node); + write_lock_bh(&sock->sk_callback_lock); + psock = smap_psock_sk(sock); + /* This check handles a racing sock event that can get the + * sk_callback_lock before this case but after xchg happens + * causing the refcnt to hit zero and sock user data (psock) + * to be null and queued for garbage collection. 
+ */ + if (likely(psock)) { + smap_list_remove(psock, NULL, l); + smap_release_sock(psock, sock); + } + write_unlock_bh(&sock->sk_callback_lock); + free_htab_elem(htab, l); + ret = 0; + } + raw_spin_unlock_bh(&b->lock); + return ret; +} + +struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct htab_elem *l; + u32 key_size, hash; + struct bucket *b; + struct sock *sk; + + key_size = map->key_size; + hash = htab_map_hash(key, key_size); + b = __select_bucket(htab, hash); + head = &b->head; + + raw_spin_lock_bh(&b->lock); + l = lookup_elem_raw(head, hash, key, key_size); + sk = l ? l->sk : NULL; + raw_spin_unlock_bh(&b->lock); + return sk; +} + const struct bpf_map_ops sock_map_ops = { .map_alloc = sock_map_alloc, .map_free = sock_map_free, @@ -1933,6 +2375,15 @@ const struct bpf_map_ops sock_map_ops = { .map_release_uref = sock_map_release, }; +const struct bpf_map_ops sock_hash_ops = { + .map_alloc = sock_hash_alloc, + .map_free = sock_hash_free, + .map_lookup_elem = sock_map_lookup, + .map_get_next_key = sock_hash_get_next_key, + .map_update_elem = sock_hash_update_elem, + .map_delete_elem = sock_hash_delete_elem, +}; + BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, struct bpf_map *, map, void *, key, u64, flags) { @@ -1950,3 +2401,21 @@ const struct bpf_func_proto bpf_sock_map_update_proto = { .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; + +BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, bpf_sock, + struct bpf_map *, map, void *, key, u64, flags) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return sock_hash_ctx_update_elem(bpf_sock, map, key, flags); +} + +const struct bpf_func_proto bpf_sock_hash_update_proto = { + .func = bpf_sock_hash_update, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = 
ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d92d9c37affd..a9e4b1372da6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2093,6 +2093,13 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_msg_redirect_map) goto error; break; + case BPF_MAP_TYPE_SOCKHASH: + if (func_id != BPF_FUNC_sk_redirect_hash && + func_id != BPF_FUNC_sock_hash_update && + func_id != BPF_FUNC_map_delete_elem && + func_id != BPF_FUNC_msg_redirect_hash) + goto error; + break; default: break; } @@ -2130,11 +2137,14 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, break; case BPF_FUNC_sk_redirect_map: case BPF_FUNC_msg_redirect_map: + case BPF_FUNC_sock_map_update: if (map->map_type != BPF_MAP_TYPE_SOCKMAP) goto error; break; - case BPF_FUNC_sock_map_update: - if (map->map_type != BPF_MAP_TYPE_SOCKMAP) + case BPF_FUNC_sk_redirect_hash: + case BPF_FUNC_msg_redirect_hash: + case BPF_FUNC_sock_hash_update: + if (map->map_type != BPF_MAP_TYPE_SOCKHASH) goto error; break; default: diff --git a/net/core/filter.c b/net/core/filter.c index 61a3ed6bac25..6d0d1560bd70 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2074,6 +2074,33 @@ static const struct bpf_func_proto bpf_redirect_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, + struct bpf_map *, map, void *, key, u64, flags) +{ + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + + /* Only BPF_F_INGRESS is a valid flag; drop on anything else. 
*/ + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; + + tcb->bpf.flags = flags; + tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key); + if (!tcb->bpf.sk_redir) + return SK_DROP; + + return SK_PASS; +} + +static const struct bpf_func_proto bpf_sk_redirect_hash_proto = { + .func = bpf_sk_redirect_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct bpf_map *, map, u32, key, u64, flags) { @@ -2108,6 +2135,31 @@ static const struct bpf_func_proto bpf_sk_redirect_map_proto = { .arg4_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg_buff *, msg, + struct bpf_map *, map, void *, key, u64, flags) +{ + /* Only BPF_F_INGRESS is a valid flag; drop on anything else. */ + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; + + msg->flags = flags; + msg->sk_redir = __sock_hash_lookup_elem(map, key); + if (!msg->sk_redir) + return SK_DROP; + + return SK_PASS; +} + +static const struct bpf_func_proto bpf_msg_redirect_hash_proto = { + .func = bpf_msg_redirect_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg, struct bpf_map *, map, u32, key, u64, flags) { @@ -4502,6 +4554,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sock_ops_cb_flags_set_proto; case BPF_FUNC_sock_map_update: return &bpf_sock_map_update_proto; + case BPF_FUNC_sock_hash_update: + return &bpf_sock_hash_update_proto; default: return bpf_base_func_proto(func_id); } @@ -4513,6 +4567,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) switch (func_id) { case BPF_FUNC_msg_redirect_map: return &bpf_msg_redirect_map_proto; + case 
BPF_FUNC_msg_redirect_hash: + return &bpf_msg_redirect_hash_proto; case BPF_FUNC_msg_apply_bytes: return &bpf_msg_apply_bytes_proto; case BPF_FUNC_msg_cork_bytes: @@ -4544,6 +4600,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_uid_proto; case BPF_FUNC_sk_redirect_map: return &bpf_sk_redirect_map_proto; + case BPF_FUNC_sk_redirect_hash: + return &bpf_sk_redirect_hash_proto; default: return bpf_base_func_proto(func_id); }