tcp: add TCP_ZEROCOPY_RECEIVE support for zerocopy receive
When adding the tcp mmap() implementation, I forgot that the socket lock
had to be taken before current->mm->mmap_sem. syzbot eventually caught
the bug.
Since we cannot lock the socket in the tcp mmap() handler, we have to
split the operation into two phases.
1) mmap() on a tcp socket simply reserves VMA space, and nothing else.
This operation does not involve any TCP locking.
2) getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) implements
the transfer of pages from skbs to one VMA.
This operation only uses down_read(&current->mm->mmap_sem) after
holding the TCP lock, thus solving the lockdep issue.
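For illustration, here is a minimal userspace sketch of the two-phase
calling sequence. It is not part of this patch: the CHUNK size and the
helper names (zc_setup(), zc_read_chunk()) are made up, and the #ifndef
fallback simply mirrors the uapi addition below for older toolchain headers.

	#include <sys/mman.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <linux/types.h>
	#include <string.h>

	#ifndef TCP_ZEROCOPY_RECEIVE	/* mirrors the uapi change below */
	#define TCP_ZEROCOPY_RECEIVE 35
	struct tcp_zerocopy_receive {
		__u64 address;		/* in: address of mapping */
		__u32 length;		/* in/out: number of bytes to map/mapped */
		__u32 recv_skip_hint;	/* out: amount of bytes to skip */
	};
	#endif

	#define CHUNK (128 * 1024)	/* example size, must be page aligned */

	/* Phase 1: plain mmap() just reserves VMA space, no TCP locking. */
	static void *zc_setup(int fd)
	{
		return mmap(NULL, CHUNK, PROT_READ, MAP_SHARED, fd, 0);
	}

	/* Phase 2: getsockopt() maps already-received pages into the VMA.
	 * On success, zc->length holds the number of bytes actually mapped.
	 */
	static int zc_read_chunk(int fd, void *addr,
				 struct tcp_zerocopy_receive *zc)
	{
		socklen_t zclen = sizeof(*zc);

		memset(zc, 0, sizeof(*zc));
		zc->address = (__u64)(unsigned long)addr;
		zc->length = CHUNK;
		return getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE,
				  zc, &zclen);
	}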
This new implementation was suggested by Andy Lutomirski in great detail.
Benefits are:
- Better scalability, in case multiple threads reuse VMAs
(without mmap()/munmap() calls) since mmap_sem won't be write locked.
- Better error recovery.
The previous mmap() model had to provide the expected size of the
mapping. If for some reason one part could not be mapped (partial MSS),
the whole operation had to be aborted.
With the tcp_zerocopy_receive struct, the kernel can report how
many bytes were successfully mapped, and how many bytes should
be read to skip the problematic sequence (see the sketch after
this list).
- No more memory allocation to hold an array of page pointers.
16 MB mappings needed 32 KB for this array, potentially using vmalloc() :/
- skbs are freed while mmap_sem has been released
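A sketch of that recovery path, continuing the hypothetical example
above and reusing zc_read_chunk(); consume_mapped() stands in for
application logic and is not a real API:

	#include <unistd.h>

	void consume_mapped(const void *buf, __u32 len);	/* app-provided */

	/* After a successful zc_read_chunk(): zc->length bytes are mapped
	 * at addr, and zc->recv_skip_hint bytes (if any) must be drained
	 * through the regular copy path before zerocopy can be retried.
	 */
	static void zc_handle_result(int fd, void *addr,
				     const struct tcp_zerocopy_receive *zc)
	{
		char copybuf[4096];
		__u32 skip = zc->recv_skip_hint;

		consume_mapped(addr, zc->length);
		while (skip) {
			ssize_t r = read(fd, copybuf,
					 skip < sizeof(copybuf) ?
					 skip : sizeof(copybuf));
			if (r <= 0)
				break;
			skip -= r;
		}
	}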
A following patch updates the tcp_mmap tool to demonstrate
one possible use of mmap() and setsockopt(..., TCP_ZEROCOPY_RECEIVE, ...).
Note that memcg might require additional changes.
Fixes: 93ab6cc691 ("tcp: implement mmap() for zero copy receive")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Suggested-by: Andy Lutomirski <luto@kernel.org>
Cc: linux-mm@kvack.org
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -122,6 +122,7 @@ enum {
 #define TCP_MD5SIG_EXT		32	/* TCP MD5 Signature with extensions */
 #define TCP_FASTOPEN_KEY	33	/* Set the key for Fast Open (cookie) */
 #define TCP_FASTOPEN_NO_COOKIE	34	/* Enable TFO without a TFO cookie */
+#define TCP_ZEROCOPY_RECEIVE	35
 
 struct tcp_repair_opt {
 	__u32	opt_code;
@@ -276,4 +277,11 @@ struct tcp_diag_md5sig {
 	__u8	tcpm_key[TCP_MD5SIG_MAXKEYLEN];
 };
 
+/* setsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) */
+
+struct tcp_zerocopy_receive {
+	__u64 address;		/* in: address of mapping */
+	__u32 length;		/* in/out: number of bytes to map/mapped */
+	__u32 recv_skip_hint;	/* out: amount of bytes to skip */
+};
 #endif /* _UAPI_LINUX_TCP_H */
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -994,7 +994,9 @@ const struct proto_ops inet_stream_ops = {
 	.getsockopt	   = sock_common_getsockopt,
 	.sendmsg	   = inet_sendmsg,
 	.recvmsg	   = inet_recvmsg,
+#ifdef CONFIG_MMU
 	.mmap		   = tcp_mmap,
+#endif
 	.sendpage	   = inet_sendpage,
 	.splice_read	   = tcp_splice_read,
 	.read_sock	   = tcp_read_sock,
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1726,118 +1726,113 @@ int tcp_set_rcvlowat(struct sock *sk, int val)
 }
 EXPORT_SYMBOL(tcp_set_rcvlowat);
 
-/* When user wants to mmap X pages, we first need to perform the mapping
- * before freeing any skbs in receive queue, otherwise user would be unable
- * to fallback to standard recvmsg(). This happens if some data in the
- * requested block is not exactly fitting in a page.
- *
- * We only support order-0 pages for the moment.
- * mmap() on TCP is very strict, there is no point
- * trying to accommodate with pathological layouts.
- */
+#ifdef CONFIG_MMU
+static const struct vm_operations_struct tcp_vm_ops = {
+};
+
 int tcp_mmap(struct file *file, struct socket *sock,
 	     struct vm_area_struct *vma)
 {
-	unsigned long size = vma->vm_end - vma->vm_start;
-	unsigned int nr_pages = size >> PAGE_SHIFT;
-	struct page **pages_array = NULL;
-	u32 seq, len, offset, nr = 0;
-	struct sock *sk = sock->sk;
-	const skb_frag_t *frags;
+	if (vma->vm_flags & (VM_WRITE | VM_EXEC))
+		return -EPERM;
+	vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
+
+	/* Instruct vm_insert_page() to not down_read(mmap_sem) */
+	vma->vm_flags |= VM_MIXEDMAP;
+
+	vma->vm_ops = &tcp_vm_ops;
+	return 0;
+}
+EXPORT_SYMBOL(tcp_mmap);
+
+static int tcp_zerocopy_receive(struct sock *sk,
+				struct tcp_zerocopy_receive *zc)
+{
+	unsigned long address = (unsigned long)zc->address;
+	const skb_frag_t *frags = NULL;
+	u32 length = 0, seq, offset;
+	struct vm_area_struct *vma;
+	struct sk_buff *skb = NULL;
 	struct tcp_sock *tp;
-	struct sk_buff *skb;
 	int ret;
 
-	if (vma->vm_pgoff || !nr_pages)
+	if (address & (PAGE_SIZE - 1) || address != zc->address)
 		return -EINVAL;
 
-	if (vma->vm_flags & VM_WRITE)
-		return -EPERM;
-	/* TODO: Maybe the following is not needed if pages are COW */
-	vma->vm_flags &= ~VM_MAYWRITE;
-
-	lock_sock(sk);
-
-	ret = -ENOTCONN;
 	if (sk->sk_state == TCP_LISTEN)
-		goto out;
+		return -ENOTCONN;
 
 	sock_rps_record_flow(sk);
 
-	if (tcp_inq(sk) < size) {
-		ret = sock_flag(sk, SOCK_DONE) ? -EIO : -EAGAIN;
+	down_read(&current->mm->mmap_sem);
+
+	ret = -EINVAL;
+	vma = find_vma(current->mm, address);
+	if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops)
 		goto out;
-	}
+	zc->length = min_t(unsigned long, zc->length, vma->vm_end - address);
+
 	tp = tcp_sk(sk);
 	seq = tp->copied_seq;
-	/* Abort if urgent data is in the area */
-	if (unlikely(tp->urg_data)) {
-		u32 urg_offset = tp->urg_seq - seq;
+	zc->length = min_t(u32, zc->length, tcp_inq(sk));
+	zc->length &= ~(PAGE_SIZE - 1);
 
-		ret = -EINVAL;
-		if (urg_offset < size)
-			goto out;
-	}
-	ret = -ENOMEM;
-	pages_array = kvmalloc_array(nr_pages, sizeof(struct page *),
-				     GFP_KERNEL);
-	if (!pages_array)
-		goto out;
-	skb = tcp_recv_skb(sk, seq, &offset);
-	ret = -EINVAL;
-skb_start:
-	/* We do not support anything not in page frags */
-	offset -= skb_headlen(skb);
-	if ((int)offset < 0)
-		goto out;
-	if (skb_has_frag_list(skb))
-		goto out;
-	len = skb->data_len - offset;
-	frags = skb_shinfo(skb)->frags;
-	while (offset) {
-		if (frags->size > offset)
-			goto out;
-		offset -= frags->size;
+	zap_page_range(vma, address, zc->length);
+
+	zc->recv_skip_hint = 0;
+	ret = 0;
+	while (length + PAGE_SIZE <= zc->length) {
+		if (zc->recv_skip_hint < PAGE_SIZE) {
+			if (skb) {
+				skb = skb->next;
+				offset = seq - TCP_SKB_CB(skb)->seq;
+			} else {
+				skb = tcp_recv_skb(sk, seq, &offset);
+			}
+
+			zc->recv_skip_hint = skb->len - offset;
+			offset -= skb_headlen(skb);
+			if ((int)offset < 0 || skb_has_frag_list(skb))
+				break;
+			frags = skb_shinfo(skb)->frags;
+			while (offset) {
+				if (frags->size > offset)
+					goto out;
+				offset -= frags->size;
+				frags++;
+			}
+		}
+		if (frags->size != PAGE_SIZE || frags->page_offset)
+			break;
+		ret = vm_insert_page(vma, address + length,
+				     skb_frag_page(frags));
+		if (ret)
+			break;
+		length += PAGE_SIZE;
+		seq += PAGE_SIZE;
+		zc->recv_skip_hint -= PAGE_SIZE;
 		frags++;
 	}
-	while (nr < nr_pages) {
-		if (len) {
-			if (len < PAGE_SIZE)
-				goto out;
-			if (frags->size != PAGE_SIZE || frags->page_offset)
-				goto out;
-			pages_array[nr++] = skb_frag_page(frags);
-			frags++;
-			len -= PAGE_SIZE;
-			seq += PAGE_SIZE;
-			continue;
-		}
-		skb = skb->next;
-		offset = seq - TCP_SKB_CB(skb)->seq;
-		goto skb_start;
-	}
-	/* OK, we have a full set of pages ready to be inserted into vma */
-	for (nr = 0; nr < nr_pages; nr++) {
-		ret = vm_insert_page(vma, vma->vm_start + (nr << PAGE_SHIFT),
-				     pages_array[nr]);
-		if (ret)
-			goto out;
-	}
-	/* operation is complete, we can 'consume' all skbs */
-	tp->copied_seq = seq;
-	tcp_rcv_space_adjust(sk);
-
-	/* Clean up data we have read: This will do ACK frames. */
-	tcp_recv_skb(sk, seq, &offset);
-	tcp_cleanup_rbuf(sk, size);
-
-	ret = 0;
 out:
-	release_sock(sk);
-	kvfree(pages_array);
+	up_read(&current->mm->mmap_sem);
+	if (length) {
+		tp->copied_seq = seq;
+		tcp_rcv_space_adjust(sk);
+
+		/* Clean up data we have read: This will do ACK frames. */
+		tcp_recv_skb(sk, seq, &offset);
+		tcp_cleanup_rbuf(sk, length);
+		ret = 0;
+		if (length == zc->length)
+			zc->recv_skip_hint = 0;
+	} else {
+		if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE))
+			ret = -EIO;
+	}
+	zc->length = length;
 	return ret;
 }
-EXPORT_SYMBOL(tcp_mmap);
+#endif
 
 static void tcp_update_recv_tstamps(struct sk_buff *skb,
 				    struct scm_timestamping *tss)
@@ -3472,6 +3467,25 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 		}
 		return 0;
 	}
+#ifdef CONFIG_MMU
+	case TCP_ZEROCOPY_RECEIVE: {
+		struct tcp_zerocopy_receive zc;
+		int err;
+
+		if (get_user(len, optlen))
+			return -EFAULT;
+		if (len != sizeof(zc))
+			return -EINVAL;
+		if (copy_from_user(&zc, optval, len))
+			return -EFAULT;
+		lock_sock(sk);
+		err = tcp_zerocopy_receive(sk, &zc);
+		release_sock(sk);
+		if (!err && copy_to_user(optval, &zc, len))
+			err = -EFAULT;
+		return err;
+	}
+#endif
 	default:
 		return -ENOPROTOOPT;
 	}
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -578,7 +578,9 @@ const struct proto_ops inet6_stream_ops = {
 	.getsockopt	   = sock_common_getsockopt,	/* ok		*/
 	.sendmsg	   = inet_sendmsg,		/* ok		*/
 	.recvmsg	   = inet_recvmsg,		/* ok		*/
+#ifdef CONFIG_MMU
 	.mmap		   = tcp_mmap,
+#endif
 	.sendpage	   = inet_sendpage,
 	.sendmsg_locked	   = tcp_sendmsg_locked,
 	.sendpage_locked   = tcp_sendpage_locked,