Snap for 9769031 from 32991d5d99 to android13-5.15-keystone-qcom-release

Change-Id: I5f9e3d1946f114fa957983c1194bc8a45a498cb8
This commit is contained in:
Android Build Coastguard Worker
2023-03-18 00:00:26 +00:00
12 changed files with 341 additions and 122 deletions

View File

@@ -7244,9 +7244,6 @@ F: include/linux/fs.h
F: include/linux/fs_types.h
F: include/uapi/linux/fs.h
F: include/uapi/linux/openat2.h
X: fs/io-wq.c
X: fs/io-wq.h
X: fs/io_uring.c
FINTEK F75375S HARDWARE MONITOR AND FAN CONTROLLER DRIVER
M: Riku Voipio <riku.voipio@iki.fi>
@@ -9825,9 +9822,7 @@ L: io-uring@vger.kernel.org
S: Maintained
T: git git://git.kernel.dk/linux-block
T: git git://git.kernel.dk/liburing
F: fs/io-wq.c
F: fs/io-wq.h
F: fs/io_uring.c
F: io_uring/
F: include/linux/io_uring.h
F: include/uapi/linux/io_uring.h
F: tools/io_uring/

View File

@@ -1212,7 +1212,7 @@ endif
$(Q)$(MAKE) $(hdr-inst)=$(hdr-prefix)arch/$(SRCARCH)/include/uapi
ifeq ($(KBUILD_EXTMOD),)
core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ block/
core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ block/ io_uring/
vmlinux-dirs := $(patsubst %/,%,$(filter %/, \
$(core-y) $(core-m) $(drivers-y) $(drivers-m) \

View File

@@ -34,8 +34,6 @@ obj-$(CONFIG_TIMERFD) += timerfd.o
obj-$(CONFIG_EVENTFD) += eventfd.o
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_AIO) += aio.o
obj-$(CONFIG_IO_URING) += io_uring.o
obj-$(CONFIG_IO_WQ) += io-wq.o
obj-$(CONFIG_FS_DAX) += dax.o
obj-$(CONFIG_FS_ENCRYPTION) += crypto/
obj-$(CONFIG_FS_VERITY) += verity/

View File

@@ -43,21 +43,7 @@ struct eventfd_ctx {
int id;
};
/**
* eventfd_signal - Adds @n to the eventfd counter.
* @ctx: [in] Pointer to the eventfd context.
* @n: [in] Value of the counter to be added to the eventfd internal counter.
* The value cannot be negative.
*
* This function is supposed to be called by the kernel in paths that do not
* allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
* value, and we signal this as overflow condition by returning a EPOLLERR
* to poll(2).
*
* Returns the amount by which the counter was incremented. This will be less
* than @n if the counter has overflowed.
*/
__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask)
{
unsigned long flags;
@@ -78,12 +64,31 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
n = ULLONG_MAX - ctx->count;
ctx->count += n;
if (waitqueue_active(&ctx->wqh))
wake_up_locked_poll(&ctx->wqh, EPOLLIN);
wake_up_locked_poll(&ctx->wqh, EPOLLIN | mask);
current->in_eventfd_signal = 0;
spin_unlock_irqrestore(&ctx->wqh.lock, flags);
return n;
}
/**
* eventfd_signal - Adds @n to the eventfd counter.
* @ctx: [in] Pointer to the eventfd context.
* @n: [in] Value of the counter to be added to the eventfd internal counter.
* The value cannot be negative.
*
* This function is supposed to be called by the kernel in paths that do not
* allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
* value, and we signal this as overflow condition by returning a EPOLLERR
* to poll(2).
*
* Returns the amount by which the counter was incremented. This will be less
* than @n if the counter has overflowed.
*/
__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
{
return eventfd_signal_mask(ctx, n, 0);
}
EXPORT_SYMBOL_GPL(eventfd_signal);
static void eventfd_free_ctx(struct eventfd_ctx *ctx)

View File

@@ -485,7 +485,8 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
*/
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,
unsigned pollflags)
{
struct eventpoll *ep_src;
unsigned long flags;
@@ -516,16 +517,17 @@ static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
}
spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
ep->nests = nests + 1;
wake_up_locked_poll(&ep->poll_wait, EPOLLIN);
wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags);
ep->nests = 0;
spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
}
#else
static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,
unsigned pollflags)
{
wake_up_poll(&ep->poll_wait, EPOLLIN);
wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags);
}
#endif
@@ -736,7 +738,7 @@ static void ep_free(struct eventpoll *ep)
/* We need to release all tasks waiting for these file */
if (waitqueue_active(&ep->poll_wait))
ep_poll_safewake(ep, NULL);
ep_poll_safewake(ep, NULL, 0);
/*
* We need to lock this because we could be hit by
@@ -1202,7 +1204,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
/* We have to call this outside the lock */
if (pwake)
ep_poll_safewake(ep, epi);
ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE);
if (!(epi->event.events & EPOLLEXCLUSIVE))
ewake = 1;
@@ -1547,7 +1549,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
/* We have to call this outside the lock */
if (pwake)
ep_poll_safewake(ep, NULL);
ep_poll_safewake(ep, NULL, 0);
return 0;
}
@@ -1623,7 +1625,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
/* We have to call this outside the lock */
if (pwake)
ep_poll_safewake(ep, NULL);
ep_poll_safewake(ep, NULL, 0);
return 0;
}

View File

@@ -40,6 +40,7 @@ struct file *eventfd_fget(int fd);
struct eventfd_ctx *eventfd_ctx_fdget(int fd);
struct eventfd_ctx *eventfd_ctx_fileget(struct file *file);
__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n);
__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask);
int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
__u64 *cnt);
void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt);
@@ -66,6 +67,12 @@ static inline int eventfd_signal(struct eventfd_ctx *ctx, int n)
return -ENOSYS;
}
static inline int eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n,
unsigned mask)
{
return -ENOSYS;
}
static inline void eventfd_ctx_put(struct eventfd_ctx *ctx)
{

View File

@@ -41,6 +41,12 @@
#define EPOLLMSG (__force __poll_t)0x00000400
#define EPOLLRDHUP (__force __poll_t)0x00002000
/*
* Internal flag - wakeup generated by io_uring, used to detect recursion back
* into the io_uring poll handler.
*/
#define EPOLL_URING_WAKE ((__force __poll_t)(1U << 27))
/* Set exclusive wakeup mode for the target file descriptor */
#define EPOLLEXCLUSIVE ((__force __poll_t)(1U << 28))

6
io_uring/Makefile Normal file
View File

@@ -0,0 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
#
# Makefile for io_uring
obj-$(CONFIG_IO_URING) += io_uring.o
obj-$(CONFIG_IO_WQ) += io-wq.o

View File

@@ -513,7 +513,7 @@ static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct,
static bool io_flush_signals(void)
{
if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL))) {
if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) {
__set_current_state(TASK_RUNNING);
tracehook_notify_signal();
return true;
@@ -1217,6 +1217,12 @@ static void io_wq_cancel_tw_create(struct io_wq *wq)
worker = container_of(cb, struct io_worker, create_work);
io_worker_cancel_cb(worker);
/*
* Only the worker continuation helper has worker allocated and
* hence needs freeing.
*/
if (cb->func == create_worker_cont)
kfree(worker);
}
}

View File

@@ -85,7 +85,7 @@
#include <uapi/linux/io_uring.h>
#include "internal.h"
#include "../fs/internal.h"
#include "io-wq.h"
#define IORING_MAX_ENTRIES 32768
@@ -578,6 +578,7 @@ struct io_sr_msg {
int msg_flags;
int bgid;
size_t len;
size_t done_io;
struct io_buffer *kbuf;
};
@@ -739,6 +740,7 @@ enum {
REQ_F_CREDS_BIT,
REQ_F_REFCOUNT_BIT,
REQ_F_ARM_LTIMEOUT_BIT,
REQ_F_PARTIAL_IO_BIT,
/* keep async read/write and isreg together and in order */
REQ_F_NOWAIT_READ_BIT,
REQ_F_NOWAIT_WRITE_BIT,
@@ -794,6 +796,8 @@ enum {
REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT),
/* there is a linked timeout that has to be armed */
REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT),
/* request has already done partial IO */
REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT),
};
struct async_poll {
@@ -1091,7 +1095,8 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
unsigned nr_args);
static void io_clean_op(struct io_kiocb *req);
static struct file *io_file_get(struct io_ring_ctx *ctx,
struct io_kiocb *req, int fd, bool fixed);
struct io_kiocb *req, int fd, bool fixed,
unsigned int issue_flags);
static void __io_queue_sqe(struct io_kiocb *req);
static void io_rsrc_put_work(struct work_struct *work);
@@ -1629,13 +1634,15 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
* wake as many waiters as we need to.
*/
if (wq_has_sleeper(&ctx->cq_wait))
wake_up_all(&ctx->cq_wait);
__wake_up(&ctx->cq_wait, TASK_NORMAL, 0,
poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
wake_up(&ctx->sq_data->wait);
if (io_should_trigger_evfd(ctx))
eventfd_signal(ctx->cq_ev_fd, 1);
eventfd_signal_mask(ctx->cq_ev_fd, 1, EPOLL_URING_WAKE);
if (waitqueue_active(&ctx->poll_wait))
wake_up_interruptible(&ctx->poll_wait);
__wake_up(&ctx->poll_wait, TASK_INTERRUPTIBLE, 0,
poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
}
static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
@@ -1645,12 +1652,14 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
if (ctx->flags & IORING_SETUP_SQPOLL) {
if (waitqueue_active(&ctx->cq_wait))
wake_up_all(&ctx->cq_wait);
__wake_up(&ctx->cq_wait, TASK_NORMAL, 0,
poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
}
if (io_should_trigger_evfd(ctx))
eventfd_signal(ctx->cq_ev_fd, 1);
eventfd_signal_mask(ctx->cq_ev_fd, 1, EPOLL_URING_WAKE);
if (waitqueue_active(&ctx->poll_wait))
wake_up_interruptible(&ctx->poll_wait);
__wake_up(&ctx->poll_wait, TASK_INTERRUPTIBLE, 0,
poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
}
/* Returns true if there are no backlogged entries after the flush */
@@ -2477,12 +2486,26 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
io_init_req_batch(&rb);
while (!list_empty(done)) {
struct io_uring_cqe *cqe;
unsigned cflags;
req = list_first_entry(done, struct io_kiocb, inflight_entry);
list_del(&req->inflight_entry);
io_fill_cqe_req(req, req->result, io_put_rw_kbuf(req));
cflags = io_put_rw_kbuf(req);
(*nr_events)++;
cqe = io_get_cqe(ctx);
if (cqe) {
WRITE_ONCE(cqe->user_data, req->user_data);
WRITE_ONCE(cqe->res, req->result);
WRITE_ONCE(cqe->flags, cflags);
} else {
spin_lock(&ctx->completion_lock);
io_cqring_event_overflow(ctx, req->user_data,
req->result, cflags);
spin_unlock(&ctx->completion_lock);
}
if (req_ref_put_and_test(req))
io_req_free_batch(&rb, req, &ctx->submit_state);
}
@@ -2681,17 +2704,32 @@ static bool io_rw_should_reissue(struct io_kiocb *req)
}
#endif
static bool __io_complete_rw_common(struct io_kiocb *req, long res)
/*
* Trigger the notifications after having done some IO, and finish the write
* accounting, if any.
*/
static void io_req_io_end(struct io_kiocb *req)
{
if (req->rw.kiocb.ki_flags & IOCB_WRITE) {
struct io_rw *rw = &req->rw;
if (rw->kiocb.ki_flags & IOCB_WRITE) {
kiocb_end_write(req);
fsnotify_modify(req->file);
} else {
fsnotify_access(req->file);
}
}
static bool __io_complete_rw_common(struct io_kiocb *req, long res)
{
if (res != req->result) {
if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
io_rw_should_reissue(req)) {
/*
* Reissue will start accounting again, finish the
* current cycle.
*/
io_req_io_end(req);
req->flags |= REQ_F_REISSUE;
return true;
}
@@ -2701,7 +2739,7 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res)
return false;
}
static inline int io_fixup_rw_res(struct io_kiocb *req, unsigned res)
static inline int io_fixup_rw_res(struct io_kiocb *req, long res)
{
struct io_async_rw *io = req->async_data;
@@ -2733,12 +2771,10 @@ static void io_req_task_complete(struct io_kiocb *req, bool *locked)
}
}
static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
unsigned int issue_flags)
static void io_req_rw_complete(struct io_kiocb *req, bool *locked)
{
if (__io_complete_rw_common(req, res))
return;
__io_req_complete(req, issue_flags, io_fixup_rw_res(req, res), io_put_rw_kbuf(req));
io_req_io_end(req);
io_req_task_complete(req, locked);
}
static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
@@ -2748,7 +2784,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
if (__io_complete_rw_common(req, res))
return;
req->result = io_fixup_rw_res(req, res);
req->io_task_work.func = io_req_task_complete;
req->io_task_work.func = io_req_rw_complete;
io_req_task_work_add(req);
}
@@ -2900,14 +2936,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
req->flags |= REQ_F_ISREG;
kiocb->ki_pos = READ_ONCE(sqe->off);
if (kiocb->ki_pos == -1) {
if (!(file->f_mode & FMODE_STREAM)) {
req->flags |= REQ_F_CUR_POS;
kiocb->ki_pos = file->f_pos;
} else {
kiocb->ki_pos = 0;
}
}
kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
@@ -2989,6 +3017,23 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
}
}
static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
{
struct kiocb *kiocb = &req->rw.kiocb;
if (kiocb->ki_pos != -1)
return &kiocb->ki_pos;
if (!(req->file->f_mode & FMODE_STREAM)) {
req->flags |= REQ_F_CUR_POS;
kiocb->ki_pos = req->file->f_pos;
return &kiocb->ki_pos;
}
kiocb->ki_pos = 0;
return NULL;
}
static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
unsigned int issue_flags)
{
@@ -2996,10 +3041,20 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
if (req->flags & REQ_F_CUR_POS)
req->file->f_pos = kiocb->ki_pos;
if (ret >= 0 && (kiocb->ki_complete == io_complete_rw))
__io_complete_rw(req, ret, 0, issue_flags);
else
if (ret >= 0 && (kiocb->ki_complete == io_complete_rw)) {
if (!__io_complete_rw_common(req, ret)) {
/*
* Safe to call io_end from here as we're inline
* from the submission path.
*/
io_req_io_end(req);
__io_req_complete(req, issue_flags,
io_fixup_rw_res(req, ret),
io_put_rw_kbuf(req));
}
} else {
io_rw_done(kiocb, ret);
}
if (req->flags & REQ_F_REISSUE) {
req->flags &= ~REQ_F_REISSUE;
@@ -3281,6 +3336,7 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
struct kiocb *kiocb = &req->rw.kiocb;
struct file *file = req->file;
ssize_t ret = 0;
loff_t *ppos;
/*
* Don't support polled IO through this interface, and we can't
@@ -3292,6 +3348,8 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
if (kiocb->ki_flags & IOCB_NOWAIT)
return -EAGAIN;
ppos = io_kiocb_ppos(kiocb);
while (iov_iter_count(iter)) {
struct iovec iovec;
ssize_t nr;
@@ -3305,10 +3363,10 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
if (rw == READ) {
nr = file->f_op->read(file, iovec.iov_base,
iovec.iov_len, io_kiocb_ppos(kiocb));
iovec.iov_len, ppos);
} else {
nr = file->f_op->write(file, iovec.iov_base,
iovec.iov_len, io_kiocb_ppos(kiocb));
iovec.iov_len, ppos);
}
if (nr < 0) {
@@ -3509,6 +3567,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
struct iov_iter_state __state, *state;
ssize_t ret, ret2;
loff_t *ppos;
if (rw) {
iter = &rw->iter;
@@ -3541,7 +3600,9 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
return ret ?: -EAGAIN;
}
ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result);
ppos = io_kiocb_update_pos(req);
ret = rw_verify_area(READ, req->file, ppos, req->result);
if (unlikely(ret)) {
kfree(iovec);
return ret;
@@ -3645,6 +3706,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
struct iov_iter_state __state, *state;
ssize_t ret, ret2;
loff_t *ppos;
if (rw) {
iter = &rw->iter;
@@ -3675,7 +3737,9 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
(req->flags & REQ_F_ISREG))
goto copy_iov;
ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result);
ppos = io_kiocb_update_pos(req);
ret = rw_verify_area(WRITE, req->file, ppos, req->result);
if (unlikely(ret))
goto out_free;
@@ -4058,7 +4122,7 @@ static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
return -EAGAIN;
in = io_file_get(req->ctx, req, sp->splice_fd_in,
(sp->flags & SPLICE_F_FD_IN_FIXED));
(sp->flags & SPLICE_F_FD_IN_FIXED), issue_flags);
if (!in) {
ret = -EBADF;
goto done;
@@ -4098,7 +4162,7 @@ static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
return -EAGAIN;
in = io_file_get(req->ctx, req, sp->splice_fd_in,
(sp->flags & SPLICE_F_FD_IN_FIXED));
(sp->flags & SPLICE_F_FD_IN_FIXED), issue_flags);
if (!in) {
ret = -EBADF;
goto done;
@@ -4755,6 +4819,13 @@ static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
}
#if defined(CONFIG_NET)
static bool io_net_retry(struct socket *sock, int flags)
{
if (!(flags & MSG_WAITALL))
return false;
return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
}
static int io_setup_async_msg(struct io_kiocb *req,
struct io_async_msghdr *kmsg)
{
@@ -4772,8 +4843,10 @@ static int io_setup_async_msg(struct io_kiocb *req,
if (async_msg->msg.msg_name)
async_msg->msg.msg_name = &async_msg->addr;
/* if were using fast_iov, set it to the new one */
if (!async_msg->free_iov)
async_msg->msg.msg_iter.iov = async_msg->fast_iov;
if (!kmsg->free_iov) {
size_t fast_idx = kmsg->msg.msg_iter.iov - kmsg->fast_iov;
async_msg->msg.msg_iter.iov = &async_msg->fast_iov[fast_idx];
}
return -EAGAIN;
}
@@ -4818,12 +4891,14 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (req->ctx->compat)
sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
sr->done_io = 0;
return 0;
}
static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_async_msghdr iomsg, *kmsg;
struct io_sr_msg *sr = &req->sr_msg;
struct socket *sock;
unsigned flags;
int min_ret = 0;
@@ -4848,17 +4923,27 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
min_ret = iov_iter_count(&kmsg->msg.msg_iter);
ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
return io_setup_async_msg(req, kmsg);
if (ret == -ERESTARTSYS)
ret = -EINTR;
if (ret < min_ret) {
if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
return io_setup_async_msg(req, kmsg);
if (ret == -ERESTARTSYS)
ret = -EINTR;
if (ret > 0 && io_net_retry(sock, flags)) {
sr->done_io += ret;
req->flags |= REQ_F_PARTIAL_IO;
return io_setup_async_msg(req, kmsg);
}
req_set_fail(req);
}
/* fast path, check for non-NULL to avoid function call */
if (kmsg->free_iov)
kfree(kmsg->free_iov);
req->flags &= ~REQ_F_NEED_CLEANUP;
if (ret < min_ret)
req_set_fail(req);
if (ret >= 0)
ret += sr->done_io;
else if (sr->done_io)
ret = sr->done_io;
__io_req_complete(req, issue_flags, ret, 0);
return 0;
}
@@ -4894,13 +4979,24 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags)
msg.msg_flags = flags;
ret = sock_sendmsg(sock, &msg);
if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
return -EAGAIN;
if (ret == -ERESTARTSYS)
ret = -EINTR;
if (ret < min_ret)
if (ret < min_ret) {
if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
return -EAGAIN;
if (ret == -ERESTARTSYS)
ret = -EINTR;
if (ret > 0 && io_net_retry(sock, flags)) {
sr->len -= ret;
sr->buf += ret;
sr->done_io += ret;
req->flags |= REQ_F_PARTIAL_IO;
return -EAGAIN;
}
req_set_fail(req);
}
if (ret >= 0)
ret += sr->done_io;
else if (sr->done_io)
ret = sr->done_io;
__io_req_complete(req, issue_flags, ret, 0);
return 0;
}
@@ -5044,12 +5140,14 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (req->ctx->compat)
sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
sr->done_io = 0;
return 0;
}
static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_async_msghdr iomsg, *kmsg;
struct io_sr_msg *sr = &req->sr_msg;
struct socket *sock;
struct io_buffer *kbuf;
unsigned flags;
@@ -5087,10 +5185,20 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
kmsg->uaddr, flags);
if (force_nonblock && ret == -EAGAIN)
return io_setup_async_msg(req, kmsg);
if (ret == -ERESTARTSYS)
ret = -EINTR;
if (ret < min_ret) {
if (ret == -EAGAIN && force_nonblock)
return io_setup_async_msg(req, kmsg);
if (ret == -ERESTARTSYS)
ret = -EINTR;
if (ret > 0 && io_net_retry(sock, flags)) {
sr->done_io += ret;
req->flags |= REQ_F_PARTIAL_IO;
return io_setup_async_msg(req, kmsg);
}
req_set_fail(req);
} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
req_set_fail(req);
}
if (req->flags & REQ_F_BUFFER_SELECTED)
cflags = io_put_recv_kbuf(req);
@@ -5098,8 +5206,10 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
if (kmsg->free_iov)
kfree(kmsg->free_iov);
req->flags &= ~REQ_F_NEED_CLEANUP;
if (ret < min_ret || ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
req_set_fail(req);
if (ret >= 0)
ret += sr->done_io;
else if (sr->done_io)
ret = sr->done_io;
__io_req_complete(req, issue_flags, ret, cflags);
return 0;
}
@@ -5146,15 +5256,29 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
min_ret = iov_iter_count(&msg.msg_iter);
ret = sock_recvmsg(sock, &msg, flags);
if (force_nonblock && ret == -EAGAIN)
return -EAGAIN;
if (ret == -ERESTARTSYS)
ret = -EINTR;
if (ret < min_ret) {
if (ret == -EAGAIN && force_nonblock)
return -EAGAIN;
if (ret == -ERESTARTSYS)
ret = -EINTR;
if (ret > 0 && io_net_retry(sock, flags)) {
sr->len -= ret;
sr->buf += ret;
sr->done_io += ret;
req->flags |= REQ_F_PARTIAL_IO;
return -EAGAIN;
}
req_set_fail(req);
} else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
out_free:
req_set_fail(req);
}
if (req->flags & REQ_F_BUFFER_SELECTED)
cflags = io_put_recv_kbuf(req);
if (ret < min_ret || ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
req_set_fail(req);
if (ret >= 0)
ret += sr->done_io;
else if (sr->done_io)
ret = sr->done_io;
__io_req_complete(req, issue_flags, ret, cflags);
return 0;
}
@@ -5192,9 +5316,6 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
struct file *file;
int ret, fd;
if (req->file->f_flags & O_NONBLOCK)
req->flags |= REQ_F_NOWAIT;
if (!fixed) {
fd = __get_unused_fd_flags(accept->flags, accept->nofile);
if (unlikely(fd < 0))
@@ -5206,6 +5327,8 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
if (!fixed)
put_unused_fd(fd);
ret = PTR_ERR(file);
/* safe to retry */
req->flags |= REQ_F_PARTIAL_IO;
if (ret == -EAGAIN && force_nonblock)
return -EAGAIN;
if (ret == -ERESTARTSYS)
@@ -5322,7 +5445,29 @@ struct io_poll_table {
};
#define IO_POLL_CANCEL_FLAG BIT(31)
#define IO_POLL_REF_MASK GENMASK(30, 0)
#define IO_POLL_RETRY_FLAG BIT(30)
#define IO_POLL_REF_MASK GENMASK(29, 0)
/*
* We usually have 1-2 refs taken, 128 is more than enough and we want to
* maximise the margin between this amount and the moment when it overflows.
*/
#define IO_POLL_REF_BIAS 128
static bool io_poll_get_ownership_slowpath(struct io_kiocb *req)
{
int v;
/*
* poll_refs are already elevated and we don't have much hope for
* grabbing the ownership. Instead of incrementing set a retry flag
* to notify the loop that there might have been some change.
*/
v = atomic_fetch_or(IO_POLL_RETRY_FLAG, &req->poll_refs);
if (v & IO_POLL_REF_MASK)
return false;
return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
}
/*
* If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can
@@ -5332,6 +5477,8 @@ struct io_poll_table {
*/
static inline bool io_poll_get_ownership(struct io_kiocb *req)
{
if (unlikely(atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS))
return io_poll_get_ownership_slowpath(req);
return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
}
@@ -5440,6 +5587,23 @@ static int io_poll_check_events(struct io_kiocb *req)
return 0;
if (v & IO_POLL_CANCEL_FLAG)
return -ECANCELED;
/*
* cqe.res contains only events of the first wake up
* and all others are be lost. Redo vfs_poll() to get
* up to date state.
*/
if ((v & IO_POLL_REF_MASK) != 1)
req->result = 0;
if (v & IO_POLL_RETRY_FLAG) {
req->result = 0;
/*
* We won't find new events that came in between
* vfs_poll and the ref put unless we clear the
* flag in advance.
*/
atomic_andnot(IO_POLL_RETRY_FLAG, &req->poll_refs);
v &= ~IO_POLL_RETRY_FLAG;
}
if (!req->result) {
struct poll_table_struct pt = { ._key = poll->events };
@@ -5464,11 +5628,15 @@ static int io_poll_check_events(struct io_kiocb *req)
return 0;
}
/* force the next iteration to vfs_poll() */
req->result = 0;
/*
* Release all references, retry if someone tried to restart
* task_work while we were executing it.
*/
} while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs));
} while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs) &
IO_POLL_REF_MASK);
return 1;
}
@@ -5577,8 +5745,17 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
if (mask && !(mask & poll->events))
return 0;
if (io_poll_get_ownership(req))
if (io_poll_get_ownership(req)) {
/*
* If we trigger a multishot poll off our own wakeup path,
* disable multishot as there is a circular dependency between
* CQ posting and triggering the event.
*/
if (mask & EPOLL_URING_WAKE)
poll->events |= EPOLLONESHOT;
__io_poll_execute(req, mask);
}
return 1;
}
@@ -5640,7 +5817,6 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
struct io_poll_table *ipt, __poll_t mask)
{
struct io_ring_ctx *ctx = req->ctx;
int v;
INIT_HLIST_NODE(&req->hash_node);
io_init_poll_iocb(poll, mask, io_poll_wake);
@@ -5686,11 +5862,10 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
}
/*
* Release ownership. If someone tried to queue a tw while it was
* locked, kick it off for them.
* Try to release ownership. If we see a change of state, e.g.
* poll was waken up, queue up a tw, it'll deal with it.
*/
v = atomic_dec_return(&req->poll_refs);
if (unlikely(v & IO_POLL_REF_MASK))
if (atomic_cmpxchg(&req->poll_refs, 1, 0) != 1)
__io_poll_execute(req, 0);
return 0;
}
@@ -5721,7 +5896,7 @@ static int io_arm_poll_handler(struct io_kiocb *req)
if (!req->file || !file_can_poll(req->file))
return IO_APOLL_ABORTED;
if (req->flags & REQ_F_POLLED)
if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED)
return IO_APOLL_ABORTED;
if (!def->pollin && !def->pollout)
return IO_APOLL_ABORTED;
@@ -5737,7 +5912,12 @@ static int io_arm_poll_handler(struct io_kiocb *req)
mask |= POLLOUT | POLLWRNORM;
}
apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
if (req->flags & REQ_F_POLLED) {
apoll = req->apoll;
kfree(apoll->double_poll);
} else {
apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
}
if (unlikely(!apoll))
return IO_APOLL_ABORTED;
apoll->double_poll = NULL;
@@ -6868,13 +7048,16 @@ static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file
}
static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
struct io_kiocb *req, int fd)
struct io_kiocb *req, int fd,
unsigned int issue_flags)
{
struct file *file;
struct file *file = NULL;
unsigned long file_ptr;
io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
if (unlikely((unsigned int)fd >= ctx->nr_user_files))
return NULL;
goto out;
fd = array_index_nospec(fd, ctx->nr_user_files);
file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
file = (struct file *) (file_ptr & FFS_MASK);
@@ -6882,6 +7065,8 @@ static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
/* mask in overlapping REQ_F and FFS bits */
req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT);
io_req_set_rsrc_node(req);
out:
io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
return file;
}
@@ -6899,10 +7084,11 @@ static struct file *io_file_get_normal(struct io_ring_ctx *ctx,
}
static inline struct file *io_file_get(struct io_ring_ctx *ctx,
struct io_kiocb *req, int fd, bool fixed)
struct io_kiocb *req, int fd, bool fixed,
unsigned int issue_flags)
{
if (fixed)
return io_file_get_fixed(ctx, req, fd);
return io_file_get_fixed(ctx, req, fd, issue_flags);
else
return io_file_get_normal(ctx, req, fd);
}
@@ -7124,7 +7310,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
if (io_op_defs[req->opcode].needs_file) {
req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
(sqe_flags & IOSQE_FIXED_FILE));
(sqe_flags & IOSQE_FIXED_FILE),
IO_URING_F_NONBLOCK);
if (unlikely(!req->file))
ret = -EBADF;
}
@@ -7555,7 +7742,7 @@ static int io_run_task_work_sig(void)
/* when returns >0, the caller should retry */
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq,
ktime_t timeout)
ktime_t *timeout)
{
int ret;
@@ -7567,7 +7754,7 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
if (test_bit(0, &ctx->check_cq_overflow))
return 1;
if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS))
if (!schedule_hrtimeout(timeout, HRTIMER_MODE_ABS))
return -ETIME;
return 1;
}
@@ -7630,7 +7817,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
}
prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
TASK_INTERRUPTIBLE);
ret = io_cqring_wait_schedule(ctx, &iowq, timeout);
ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
finish_wait(&ctx->cq_wait, &iowq.wq);
cond_resched();
} while (ret > 0);
@@ -9424,8 +9611,10 @@ static void io_tctx_exit_cb(struct callback_head *cb)
/*
* When @in_idle, we're in cancellation and it's racy to remove the
* node. It'll be removed by the end of cancellation, just ignore it.
* tctx can be NULL if the queueing of this task_work raced with
* work cancelation off the exec path.
*/
if (!atomic_read(&tctx->in_idle))
if (tctx && !atomic_read(&tctx->in_idle))
io_uring_del_tctx_node((unsigned long)work->ctx);
complete(&work->completion);
}
@@ -9548,6 +9737,10 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
/* if we failed setting up the ctx, we might not have any rings */
io_iopoll_try_reap_events(ctx);
/* drop cached put refs after potentially doing completions */
if (current->io_uring)
io_uring_drop_tctx_refs(current);
INIT_WORK(&ctx->exit_work, io_ring_exit_work);
/*
* Use system_unbound_wq to avoid spawning tons of event kworkers
@@ -10850,8 +11043,6 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
return -ENXIO;
if (ctx->restricted) {
if (opcode >= IORING_REGISTER_LAST)
return -EINVAL;
opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
if (!test_bit(opcode, ctx->restrictions.register_op))
return -EACCES;
@@ -10983,6 +11174,9 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
long ret = -EBADF;
struct fd f;
if (opcode >= IORING_REGISTER_LAST)
return -EINVAL;
f = fdget(fd);
if (!f.file)
return -EBADF;

View File

@@ -21,7 +21,7 @@
#include <asm/tlb.h>
#include "../workqueue_internal.h"
#include "../../fs/io-wq.h"
#include "../../io_uring/io-wq.h"
#include "../smpboot.h"
#include "pelt.h"