From 1da6c4d9140cb7c13e87667dc4e1488d6c8fc10f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 25 Mar 2019 15:54:43 +0100 Subject: [PATCH 01/10] bpf: fix use after free in bpf_evict_inode syzkaller was able to generate the following UAF in bpf: BUG: KASAN: use-after-free in lookup_last fs/namei.c:2269 [inline] BUG: KASAN: use-after-free in path_lookupat.isra.43+0x9f8/0xc00 fs/namei.c:2318 Read of size 1 at addr ffff8801c4865c47 by task syz-executor2/9423 CPU: 0 PID: 9423 Comm: syz-executor2 Not tainted 4.20.0-rc1-next-20181109+ #110 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x244/0x39d lib/dump_stack.c:113 print_address_description.cold.7+0x9/0x1ff mm/kasan/report.c:256 kasan_report_error mm/kasan/report.c:354 [inline] kasan_report.cold.8+0x242/0x309 mm/kasan/report.c:412 __asan_report_load1_noabort+0x14/0x20 mm/kasan/report.c:430 lookup_last fs/namei.c:2269 [inline] path_lookupat.isra.43+0x9f8/0xc00 fs/namei.c:2318 filename_lookup+0x26a/0x520 fs/namei.c:2348 user_path_at_empty+0x40/0x50 fs/namei.c:2608 user_path include/linux/namei.h:62 [inline] do_mount+0x180/0x1ff0 fs/namespace.c:2980 ksys_mount+0x12d/0x140 fs/namespace.c:3258 __do_sys_mount fs/namespace.c:3272 [inline] __se_sys_mount fs/namespace.c:3269 [inline] __x64_sys_mount+0xbe/0x150 fs/namespace.c:3269 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x457569 Code: fd b3 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 cb b3 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007fde6ed96c78 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5 RAX: ffffffffffffffda RBX: 0000000000000005 RCX: 0000000000457569 RDX: 0000000020000040 RSI: 0000000020000000 RDI: 0000000000000000 RBP: 000000000072bf00 R08: 0000000020000340 R09: 0000000000000000 R10: 0000000000200000 R11: 0000000000000246 R12: 00007fde6ed976d4 R13: 00000000004c2c24 R14: 00000000004d4990 R15: 00000000ffffffff Allocated by task 9424: save_stack+0x43/0xd0 mm/kasan/kasan.c:448 set_track mm/kasan/kasan.c:460 [inline] kasan_kmalloc+0xc7/0xe0 mm/kasan/kasan.c:553 __do_kmalloc mm/slab.c:3722 [inline] __kmalloc_track_caller+0x157/0x760 mm/slab.c:3737 kstrdup+0x39/0x70 mm/util.c:49 bpf_symlink+0x26/0x140 kernel/bpf/inode.c:356 vfs_symlink+0x37a/0x5d0 fs/namei.c:4127 do_symlinkat+0x242/0x2d0 fs/namei.c:4154 __do_sys_symlink fs/namei.c:4173 [inline] __se_sys_symlink fs/namei.c:4171 [inline] __x64_sys_symlink+0x59/0x80 fs/namei.c:4171 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe Freed by task 9425: save_stack+0x43/0xd0 mm/kasan/kasan.c:448 set_track mm/kasan/kasan.c:460 [inline] __kasan_slab_free+0x102/0x150 mm/kasan/kasan.c:521 kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528 __cache_free mm/slab.c:3498 [inline] kfree+0xcf/0x230 mm/slab.c:3817 bpf_evict_inode+0x11f/0x150 kernel/bpf/inode.c:565 evict+0x4b9/0x980 fs/inode.c:558 iput_final fs/inode.c:1550 [inline] iput+0x674/0xa90 fs/inode.c:1576 do_unlinkat+0x733/0xa30 fs/namei.c:4069 __do_sys_unlink fs/namei.c:4110 [inline] __se_sys_unlink fs/namei.c:4108 [inline] __x64_sys_unlink+0x42/0x50 fs/namei.c:4108 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe In this scenario path lookup under RCU is racing with the final unlink in case of symlinks. As Linus puts it in his analysis: [...] We actually RCU-delay the inode freeing itself, but when we do the final iput(), the "evict()" function is called synchronously. Now, the simple fix would seem to just RCU-delay the kfree() of the symlink data in bpf_evict_inode(). Maybe that's the right thing to do. [...] Al suggested to piggy-back on the ->destroy_inode() callback in order to implement RCU deferral there which can then kfree() the inode->i_link eventually right before putting inode back into inode cache. By reusing free_inode_nonrcu() from there we can avoid the need for our own inode cache and just reuse generic one as we currently do. And in-fact on top of all this we should just get rid of the bpf_evict_inode() entirely. This means truncate_inode_pages_final() and clear_inode() will then simply be called by the fs core via evict(). Dropping the reference should really only be done when inode is unhashed and nothing reachable anymore, so it's better also moved into the final ->destroy_inode() callback. Fixes: 0f98621bef5d ("bpf, inode: add support for symlinks and fix mtime/ctime") Reported-by: syzbot+fb731ca573367b7f6564@syzkaller.appspotmail.com Reported-by: syzbot+a13e5ead792d6df37818@syzkaller.appspotmail.com Reported-by: syzbot+7a8ba368b47fdefca61e@syzkaller.appspotmail.com Suggested-by: Al Viro Analyzed-by: Linus Torvalds Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Linus Torvalds Acked-by: Al Viro Link: https://lore.kernel.org/lkml/0000000000006946d2057bbd0eef@google.com/T/ --- kernel/bpf/inode.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 2ada5e21dfa6..4a8f390a2b82 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -554,19 +554,6 @@ struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type typ } EXPORT_SYMBOL(bpf_prog_get_type_path); -static void bpf_evict_inode(struct inode *inode) -{ - enum bpf_type type; - - truncate_inode_pages_final(&inode->i_data); - clear_inode(inode); - - if (S_ISLNK(inode->i_mode)) - kfree(inode->i_link); - if (!bpf_inode_type(inode, &type)) - bpf_any_put(inode->i_private, type); -} - /* * Display the mount options in /proc/mounts. */ @@ -579,11 +566,28 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root) return 0; } +static void bpf_destroy_inode_deferred(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + enum bpf_type type; + + if (S_ISLNK(inode->i_mode)) + kfree(inode->i_link); + if (!bpf_inode_type(inode, &type)) + bpf_any_put(inode->i_private, type); + free_inode_nonrcu(inode); +} + +static void bpf_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, bpf_destroy_inode_deferred); +} + static const struct super_operations bpf_super_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, .show_options = bpf_show_options, - .evict_inode = bpf_evict_inode, + .destroy_inode = bpf_destroy_inode, }; enum { From 927cb78177ae3773d0d27404566a93cb8e88890c Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Wed, 20 Mar 2019 13:58:27 +0100 Subject: [PATCH 02/10] bpf: remove incorrect 'verifier bug' warning The BPF verifier checks the maximum number of call stack frames twice, first in the main CFG traversal (do_check) and then in a subsequent traversal (check_max_stack_depth). If the second check fails, it logs a 'verifier bug' warning and errors out, as the number of call stack frames should have been verified already. However, the second check may fail without indicating a verifier bug: if the excessive function calls reside in dead code, the main CFG traversal may not visit them; the subsequent traversal visits all instructions, including dead code. This case raises the question of how invalid dead code should be treated. This patch implements the conservative option and rejects such code. Signed-off-by: Paul Chaignon Tested-by: Xiao Han Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fd502c1f71eb..6c5a41f7f338 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1897,8 +1897,9 @@ continue_func: } frame++; if (frame >= MAX_CALL_FRAMES) { - WARN_ONCE(1, "verifier bug. Call stack is too deep\n"); - return -EFAULT; + verbose(env, "the call stack of %d frames is too deep !\n", + frame); + return -E2BIG; } goto process_func; } From cabacfbbe54ec6730b3c147599763892c6c03525 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Wed, 20 Mar 2019 13:58:50 +0100 Subject: [PATCH 03/10] selftests/bpf: test case for invalid call stack in dead code This patch adds a test case with an excessive number of call stack frames in dead code. Signed-off-by: Paul Chaignon Tested-by: Xiao Han Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/verifier/calls.c | 38 ++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index f2ccae39ee66..fb11240b758b 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -907,6 +907,44 @@ .errstr = "call stack", .result = REJECT, }, +{ + "calls: stack depth check in dead code", + .insns = { + /* main */ + BPF_MOV64_IMM(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call A */ + BPF_EXIT_INSN(), + /* A */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1), + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 2), /* call B */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + /* B */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call C */ + BPF_EXIT_INSN(), + /* C */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call D */ + BPF_EXIT_INSN(), + /* D */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call E */ + BPF_EXIT_INSN(), + /* E */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call F */ + BPF_EXIT_INSN(), + /* F */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call G */ + BPF_EXIT_INSN(), + /* G */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call H */ + BPF_EXIT_INSN(), + /* H */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_XDP, + .errstr = "call stack", + .result = REJECT, +}, { "calls: spill into caller stack frame", .insns = { From f52c97d9df983cc38a8809d0910e5eaba0c180b3 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 25 Mar 2019 15:12:15 +0100 Subject: [PATCH 04/10] bpf, doc: fix BTF docs reflow of bullet list Section 2.2.1 BTF_KIND_INT a bullet list was collapsed due to text reflow in commit 9ab5305dbe3f ("docs/btf: reflow text to fill up to 78 characters"). This patch correct the mistake. Also adjust next bullet list, which is used for comparison, to get rendered the same way. Fixes: 9ab5305dbe3f ("docs/btf: reflow text to fill up to 78 characters") Link: https://www.kernel.org/doc/html/latest/bpf/btf.html#btf-kind-int Signed-off-by: Jesper Dangaard Brouer Acked-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- Documentation/bpf/btf.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/bpf/btf.rst b/Documentation/bpf/btf.rst index 9a60a5d60e38..7313d354f20e 100644 --- a/Documentation/bpf/btf.rst +++ b/Documentation/bpf/btf.rst @@ -148,16 +148,16 @@ The ``btf_type.size * 8`` must be equal to or greater than ``BTF_INT_BITS()`` for the type. The maximum value of ``BTF_INT_BITS()`` is 128. The ``BTF_INT_OFFSET()`` specifies the starting bit offset to calculate values -for this int. For example, a bitfield struct member has: * btf member bit -offset 100 from the start of the structure, * btf member pointing to an int -type, * the int type has ``BTF_INT_OFFSET() = 2`` and ``BTF_INT_BITS() = 4`` +for this int. For example, a bitfield struct member has: + * btf member bit offset 100 from the start of the structure, + * btf member pointing to an int type, + * the int type has ``BTF_INT_OFFSET() = 2`` and ``BTF_INT_BITS() = 4`` Then in the struct memory layout, this member will occupy ``4`` bits starting from bits ``100 + 2 = 102``. Alternatively, the bitfield struct member can be the following to access the same bits as the above: - * btf member bit offset 102, * btf member pointing to an int type, * the int type has ``BTF_INT_OFFSET() = 0`` and ``BTF_INT_BITS() = 4`` From 9ec71c1cdbdd6c4ac0150a51d64e06c5d1bd207e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 26 Mar 2019 22:00:06 -0700 Subject: [PATCH 05/10] libbpf: fix btf_dedup equivalence check handling of different kinds btf_dedup_is_equiv() used to compare btf_type->info fields, before doing kind-specific equivalence check. This comparsion implicitly verified that candidate and canonical types are of the same kind. With enum fwd resolution logic this check couldn't be done generically anymore, as for enums info contains vlen, which differs between enum fwd and fully-defined enum, so this check was subsumed by kind-specific equivalence checks. This change caused btf_dedup_is_equiv() to let through VOID vs other types check to reach switch, which was never meant to be handing VOID kind, as VOID kind is always pre-resolved to itself and is only equivalent to itself, which is checked early in btf_dedup_is_equiv(). This change adds back BTF kind equality check in place of more generic btf_type->info check, still defering further kind-specific checks to a per-kind switch. Fixes: 9768095ba97c ("btf: resolve enum fwds in btf_dedup") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/btf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 87e3020ac1bc..cf119c9b6f27 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -2107,6 +2107,9 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id, return fwd_kind == real_kind; } + if (cand_kind != canon_kind) + return 0; + switch (cand_kind) { case BTF_KIND_INT: return btf_equal_int(cand_type, canon_type); From eb76899ce749507e09cad6816f32cede14a9b7ee Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 26 Mar 2019 22:00:07 -0700 Subject: [PATCH 06/10] selftests/bpf: add btf_dedup test for VOID equivalence check This patch adds specific test exposing bug in btf_dedup_is_equiv() when comparing candidate VOID type to a non-VOID canonical type. It's important for canonical type to be anonymous, otherwise name equality check will do the right thing and will exit early. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_btf.c | 47 ++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c index 23e3b314ca60..ec5794e4205b 100644 --- a/tools/testing/selftests/bpf/test_btf.c +++ b/tools/testing/selftests/bpf/test_btf.c @@ -5776,6 +5776,53 @@ const struct btf_dedup_test dedup_tests[] = { .dedup_table_size = 1, /* force hash collisions */ }, }, +{ + .descr = "dedup: void equiv check", + /* + * // CU 1: + * struct s { + * struct {} *x; + * }; + * // CU 2: + * struct s { + * int *x; + * }; + */ + .input = { + .raw_types = { + /* CU 1 */ + BTF_STRUCT_ENC(0, 0, 1), /* [1] struct {} */ + BTF_PTR_ENC(1), /* [2] ptr -> [1] */ + BTF_STRUCT_ENC(NAME_NTH(1), 1, 8), /* [3] struct s */ + BTF_MEMBER_ENC(NAME_NTH(2), 2, 0), + /* CU 2 */ + BTF_PTR_ENC(0), /* [4] ptr -> void */ + BTF_STRUCT_ENC(NAME_NTH(1), 1, 8), /* [5] struct s */ + BTF_MEMBER_ENC(NAME_NTH(2), 4, 0), + BTF_END_RAW, + }, + BTF_STR_SEC("\0s\0x"), + }, + .expect = { + .raw_types = { + /* CU 1 */ + BTF_STRUCT_ENC(0, 0, 1), /* [1] struct {} */ + BTF_PTR_ENC(1), /* [2] ptr -> [1] */ + BTF_STRUCT_ENC(NAME_NTH(1), 1, 8), /* [3] struct s */ + BTF_MEMBER_ENC(NAME_NTH(2), 2, 0), + /* CU 2 */ + BTF_PTR_ENC(0), /* [4] ptr -> void */ + BTF_STRUCT_ENC(NAME_NTH(1), 1, 8), /* [5] struct s */ + BTF_MEMBER_ENC(NAME_NTH(2), 4, 0), + BTF_END_RAW, + }, + BTF_STR_SEC("\0s\0x"), + }, + .opts = { + .dont_resolve_fwds = false, + .dedup_table_size = 1, /* force hash collisions */ + }, +}, { .descr = "dedup: all possible kinds (no duplicates)", .input = { From 379e2014c95b7a454713da822b8ef4ec51ab8a75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 27 Mar 2019 14:51:13 +0100 Subject: [PATCH 07/10] libbpf: add xsk.h to install_headers target MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The xsk.h header file was missing from the install_headers target in the Makefile. This patch simply adds xsk.h to the set of installed headers. Fixes: 1cad07884239 ("libbpf: add support for using AF_XDP sockets") Reported-by: Bruce Richardson Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- tools/lib/bpf/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 5bf8e52c41fc..279589c29983 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -222,6 +222,7 @@ install_headers: $(call do_install,bpf.h,$(prefix)/include/bpf,644); \ $(call do_install,libbpf.h,$(prefix)/include/bpf,644); $(call do_install,btf.h,$(prefix)/include/bpf,644); + $(call do_install,xsk.h,$(prefix)/include/bpf,644); install: install_lib From 89dedaef49d36adc2bb5e7e4c38b52fa3013c7c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 27 Mar 2019 14:51:14 +0100 Subject: [PATCH 08/10] libbpf: add libelf dependency to shared library build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The DPDK project is moving forward with its AF_XDP PMD, and during that process some libbpf issues surfaced [1]: When libbpf was built as a shared library, libelf was not included in the linking phase. Since libelf is an internal depedency to libbpf, libelf should be included. This patch adds '-lelf' to resolve that. [1] https://patches.dpdk.org/patch/50704/#93571 Fixes: 1b76c13e4b36 ("bpf tools: Introduce 'bpf' library and add bpf feature check") Suggested-by: Luca Boccassi Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- tools/lib/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 279589c29983..7beec4d5b522 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -177,7 +177,7 @@ $(OUTPUT)libbpf.so: $(OUTPUT)libbpf.so.$(LIBBPF_VERSION) $(OUTPUT)libbpf.so.$(LIBBPF_VERSION): $(BPF_IN) $(QUIET_LINK)$(CC) --shared -Wl,-soname,libbpf.so.$(VERSION) \ - -Wl,--version-script=$(VERSION_SCRIPT) $^ -o $@ + -Wl,--version-script=$(VERSION_SCRIPT) $^ -lelf -o $@ @ln -sf $(@F) $(OUTPUT)libbpf.so @ln -sf $(@F) $(OUTPUT)libbpf.so.$(VERSION) From 8543e437807970166c2b66b79935c9f4b0e6d1f9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 28 Mar 2019 16:44:28 +0100 Subject: [PATCH 09/10] bpf, libbpf: fix quiet install_headers Both btf.h and xsk.h headers are not installed quietly due to missing '\' for the call to QUIET_INSTALL. Lets fix it. Before: # make install_headers INSTALL headers if [ ! -d '''/usr/local/include/bpf' ]; then install -d -m 755 '''/usr/local/include/bpf'; fi; install btf.h -m 644 '''/usr/local/include/bpf'; if [ ! -d '''/usr/local/include/bpf' ]; then install -d -m 755 '''/usr/local/include/bpf'; fi; install xsk.h -m 644 '''/usr/local/include/bpf'; # ls /usr/local/include/bpf/ bpf.h btf.h libbpf.h xsk.h After: # make install_headers INSTALL headers # ls /usr/local/include/bpf/ bpf.h btf.h libbpf.h xsk.h Fixes: a493f5f9d8c2 ("libbpf: Install btf.h with libbpf") Fixes: 379e2014c95b ("libbpf: add xsk.h to install_headers target") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Andrii Nakryiko --- tools/lib/bpf/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 7beec4d5b522..8e7c56e9590f 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -220,8 +220,8 @@ install_lib: all_cmd install_headers: $(call QUIET_INSTALL, headers) \ $(call do_install,bpf.h,$(prefix)/include/bpf,644); \ - $(call do_install,libbpf.h,$(prefix)/include/bpf,644); - $(call do_install,btf.h,$(prefix)/include/bpf,644); + $(call do_install,libbpf.h,$(prefix)/include/bpf,644); \ + $(call do_install,btf.h,$(prefix)/include/bpf,644); \ $(call do_install,xsk.h,$(prefix)/include/bpf,644); install: install_lib From 676e4a6fe703f2dae699ee9d56f14516f9ada4ea Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 29 Mar 2019 10:18:00 +0100 Subject: [PATCH 10/10] xdp: fix cpumap redirect SKB creation bug We want to avoid leaking pointer info from xdp_frame (that is placed in top of frame) like commit 6dfb970d3dbd ("xdp: avoid leaking info stored in frame data on page reuse"), and followup commit 97e19cce05e5 ("bpf: reserve xdp_frame size in xdp headroom") that reserve this headroom. These changes also affected how cpumap constructed SKBs, as xdpf->headroom size changed, the skb data starting point were in-effect shifted with 32 bytes (sizeof xdp_frame). This was still okay, as the cpumap frame_size calculation also included xdpf->headroom which were reduced by same amount. A bug was introduced in commit 77ea5f4cbe20 ("bpf/cpumap: make sure frame_size for build_skb is aligned if headroom isn't"), where the xdpf->headroom became part of the SKB_DATA_ALIGN rounding up. This round-up to find the frame_size is in principle still correct as it does not exceed the 2048 bytes frame_size (which is max for ixgbe and i40e), but the 32 bytes offset of pkt_data_start puts this over the 2048 bytes limit. This cause skb_shared_info to spill into next frame. It is a little hard to trigger, as the SKB need to use above 15 skb_shinfo->frags[] as far as I calculate. This does happen in practise for TCP streams when skb_try_coalesce() kicks in. KASAN can be used to detect these wrong memory accesses, I've seen: BUG: KASAN: use-after-free in skb_try_coalesce+0x3cb/0x760 BUG: KASAN: wild-memory-access in skb_release_data+0xe2/0x250 Driver veth also construct a SKB from xdp_frame in this way, but is not affected, as it doesn't reserve/deduct the room (used by xdp_frame) from the SKB headroom. Instead is clears the pointers via xdp_scrub_frame(), and allows SKB to use this area. The fix in this patch is to do like veth and instead allow SKB to (re)use the area occupied by xdp_frame, by clearing via xdp_scrub_frame(). (This does kill the idea of the SKB being able to access (mem) info from this area, but I guess it was a bad idea anyhow, and it was already killed by the veth changes.) Fixes: 77ea5f4cbe20 ("bpf/cpumap: make sure frame_size for build_skb is aligned if headroom isn't") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- kernel/bpf/cpumap.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 8974b3755670..3c18260403dd 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -162,10 +162,14 @@ static void cpu_map_kthread_stop(struct work_struct *work) static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) { + unsigned int hard_start_headroom; unsigned int frame_size; void *pkt_data_start; struct sk_buff *skb; + /* Part of headroom was reserved to xdpf */ + hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom; + /* build_skb need to place skb_shared_info after SKB end, and * also want to know the memory "truesize". Thus, need to * know the memory frame size backing xdp_buff. @@ -183,15 +187,15 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, * is not at a fixed memory location, with mixed length * packets, which is bad for cache-line hotness. */ - frame_size = SKB_DATA_ALIGN(xdpf->len + xdpf->headroom) + + frame_size = SKB_DATA_ALIGN(xdpf->len + hard_start_headroom) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - pkt_data_start = xdpf->data - xdpf->headroom; + pkt_data_start = xdpf->data - hard_start_headroom; skb = build_skb(pkt_data_start, frame_size); if (!skb) return NULL; - skb_reserve(skb, xdpf->headroom); + skb_reserve(skb, hard_start_headroom); __skb_put(skb, xdpf->len); if (xdpf->metasize) skb_metadata_set(skb, xdpf->metasize); @@ -205,6 +209,9 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, * - RX ring dev queue index (skb_record_rx_queue) */ + /* Allow SKB to reuse area used by xdp_frame */ + xdp_scrub_frame(xdpf); + return skb; }