linux/net/core/stream.c
Eric Dumazet ef8d8ccdc2 tcp: make sure EPOLLOUT wont be missed
As Jason Baron explained in commit 790ba4566c ("tcp: set SOCK_NOSPACE
under memory pressure"), it is crucial we properly set SOCK_NOSPACE
when needed.

However, Jason patch had a bug, because the 'nonblocking' status
as far as sk_stream_wait_memory() is concerned is governed
by MSG_DONTWAIT flag passed at sendmsg() time :

    long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

So it is very possible that tcp sendmsg() calls sk_stream_wait_memory(),
and that sk_stream_wait_memory() returns -EAGAIN with SOCK_NOSPACE
cleared, if sk->sk_sndtimeo has been set to a small (but not zero)
value.

This patch removes the 'noblock' variable since we must always
set SOCK_NOSPACE if -EAGAIN is returned.

It also renames the do_nonblock label since we might reach this
code path even if we were in blocking mode.

Fixes: 790ba4566c ("tcp: set SOCK_NOSPACE under memory pressure")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Jason Baron <jbaron@akamai.com>
Reported-by: Vladimir Rutsky  <rutsky@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Jason Baron <jbaron@akamai.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-19 13:07:43 -07:00

216 lines
5.4 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* SUCS NET3:
*
* Generic stream handling routines. These are generic for most
* protocols. Even IP. Tonight 8-).
* This is used because TCP, LLC (others too) layer all have mostly
* identical sendmsg() and recvmsg() code.
* So we (will) share it here.
*
* Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br>
* (from old tcp.c code)
* Alan Cox <alan@lxorguk.ukuu.org.uk> (Borrowed comments 8-))
*/
#include <linux/module.h>
#include <linux/sched/signal.h>
#include <linux/net.h>
#include <linux/signal.h>
#include <linux/tcp.h>
#include <linux/wait.h>
#include <net/sock.h>
/**
* sk_stream_write_space - stream socket write_space callback.
* @sk: socket
*
* FIXME: write proper description
*/
void sk_stream_write_space(struct sock *sk)
{
struct socket *sock = sk->sk_socket;
struct socket_wq *wq;
if (__sk_stream_is_writeable(sk, 1) && sock) {
clear_bit(SOCK_NOSPACE, &sock->flags);
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
if (skwq_has_sleeper(wq))
wake_up_interruptible_poll(&wq->wait, EPOLLOUT |
EPOLLWRNORM | EPOLLWRBAND);
if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT);
rcu_read_unlock();
}
}
/**
* sk_stream_wait_connect - Wait for a socket to get into the connected state
* @sk: sock to wait on
* @timeo_p: for how long to wait
*
* Must be called with the socket locked.
*/
int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
{
DEFINE_WAIT_FUNC(wait, woken_wake_function);
struct task_struct *tsk = current;
int done;
do {
int err = sock_error(sk);
if (err)
return err;
if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
return -EPIPE;
if (!*timeo_p)
return -EAGAIN;
if (signal_pending(tsk))
return sock_intr_errno(*timeo_p);
add_wait_queue(sk_sleep(sk), &wait);
sk->sk_write_pending++;
done = sk_wait_event(sk, timeo_p,
!sk->sk_err &&
!((1 << sk->sk_state) &
~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)), &wait);
remove_wait_queue(sk_sleep(sk), &wait);
sk->sk_write_pending--;
} while (!done);
return 0;
}
EXPORT_SYMBOL(sk_stream_wait_connect);
/**
* sk_stream_closing - Return 1 if we still have things to send in our buffers.
* @sk: socket to verify
*/
static inline int sk_stream_closing(struct sock *sk)
{
return (1 << sk->sk_state) &
(TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK);
}
void sk_stream_wait_close(struct sock *sk, long timeout)
{
if (timeout) {
DEFINE_WAIT_FUNC(wait, woken_wake_function);
add_wait_queue(sk_sleep(sk), &wait);
do {
if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk), &wait))
break;
} while (!signal_pending(current) && timeout);
remove_wait_queue(sk_sleep(sk), &wait);
}
}
EXPORT_SYMBOL(sk_stream_wait_close);
/**
* sk_stream_wait_memory - Wait for more memory for a socket
* @sk: socket to wait for memory
* @timeo_p: for how long
*/
int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
{
int err = 0;
long vm_wait = 0;
long current_timeo = *timeo_p;
DEFINE_WAIT_FUNC(wait, woken_wake_function);
if (sk_stream_memory_free(sk))
current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2;
add_wait_queue(sk_sleep(sk), &wait);
while (1) {
sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
goto do_error;
if (!*timeo_p)
goto do_eagain;
if (signal_pending(current))
goto do_interrupted;
sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
if (sk_stream_memory_free(sk) && !vm_wait)
break;
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
sk->sk_write_pending++;
sk_wait_event(sk, &current_timeo, sk->sk_err ||
(sk->sk_shutdown & SEND_SHUTDOWN) ||
(sk_stream_memory_free(sk) &&
!vm_wait), &wait);
sk->sk_write_pending--;
if (vm_wait) {
vm_wait -= current_timeo;
current_timeo = *timeo_p;
if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
(current_timeo -= vm_wait) < 0)
current_timeo = 0;
vm_wait = 0;
}
*timeo_p = current_timeo;
}
out:
remove_wait_queue(sk_sleep(sk), &wait);
return err;
do_error:
err = -EPIPE;
goto out;
do_eagain:
/* Make sure that whenever EAGAIN is returned, EPOLLOUT event can
* be generated later.
* When TCP receives ACK packets that make room, tcp_check_space()
* only calls tcp_new_space() if SOCK_NOSPACE is set.
*/
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
err = -EAGAIN;
goto out;
do_interrupted:
err = sock_intr_errno(*timeo_p);
goto out;
}
EXPORT_SYMBOL(sk_stream_wait_memory);
int sk_stream_error(struct sock *sk, int flags, int err)
{
if (err == -EPIPE)
err = sock_error(sk) ? : -EPIPE;
if (err == -EPIPE && !(flags & MSG_NOSIGNAL))
send_sig(SIGPIPE, current, 0);
return err;
}
EXPORT_SYMBOL(sk_stream_error);
void sk_stream_kill_queues(struct sock *sk)
{
/* First the read buffer. */
__skb_queue_purge(&sk->sk_receive_queue);
/* Next, the error queue. */
__skb_queue_purge(&sk->sk_error_queue);
/* Next, the write queue. */
WARN_ON(!skb_queue_empty(&sk->sk_write_queue));
/* Account for returned memory. */
sk_mem_reclaim(sk);
WARN_ON(sk->sk_wmem_queued);
WARN_ON(sk->sk_forward_alloc);
/* It is _impossible_ for the backlog to contain anything
* when we get here. All user references to this socket
* have gone away, only the net layer knows can touch it.
*/
}
EXPORT_SYMBOL(sk_stream_kill_queues);