// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/poll.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring_types.h"
#include "io_uring.h"
#include "opdef.h"
#include "kbuf.h"

#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf))

#define BGID_ARRAY	64

struct io_provide_buf {
	struct file			*file;
	__u64				addr;
	__u32				len;
	__u32				bgid;
	__u16				nbufs;
	__u16				bid;
};

static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
							unsigned int bgid)
{
	if (ctx->io_bl && bgid < BGID_ARRAY)
		return &ctx->io_bl[bgid];

	return xa_load(&ctx->io_bl_xa, bgid);
}

void __io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	struct io_buffer *buf;

	/*
	 * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear
	 * the flag and hence ensure that bl->head doesn't get incremented.
	 * If the tail has already been incremented, hang on to it.
	 */
	if (req->flags & REQ_F_BUFFER_RING) {
		if (req->buf_list) {
			if (req->flags & REQ_F_PARTIAL_IO) {
				/*
				 * If we end up here, then the io_uring_lock has
				 * been kept held since we retrieved the buffer.
				 * For the io-wq case, we already cleared
				 * req->buf_list when the buffer was retrieved,
				 * hence it cannot be set here for that case.
				 */
				req->buf_list->head++;
				req->buf_list = NULL;
			} else {
				req->buf_index = req->buf_list->bgid;
				req->flags &= ~REQ_F_BUFFER_RING;
			}
		}
		return;
	}

	io_ring_submit_lock(ctx, issue_flags);

	buf = req->kbuf;
	bl = io_buffer_get_list(ctx, buf->bgid);
	list_add(&buf->list, &bl->buf_list);
	req->flags &= ~REQ_F_BUFFER_SELECTED;
	req->buf_index = buf->bgid;

	io_ring_submit_unlock(ctx, issue_flags);
}

static int io_buffer_add_list(struct io_ring_ctx *ctx,
			      struct io_buffer_list *bl, unsigned int bgid)
{
	bl->bgid = bgid;
	if (bgid < BGID_ARRAY)
		return 0;

	return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
}

unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
{
	unsigned int cflags;

	/*
	 * We can add this buffer back to two lists:
	 *
	 * 1) The io_buffers_cache list. This one is protected by the
	 *    ctx->uring_lock. If we already hold this lock, add back to this
	 *    list as we can grab it from issue as well.
	 * 2) The io_buffers_comp list. This one is protected by the
	 *    ctx->completion_lock.
	 *
	 * We migrate buffers from the comp_list to the issue cache list
	 * when we need one.
	 */
	if (req->flags & REQ_F_BUFFER_RING) {
		/* no buffers to recycle for this case */
		cflags = __io_put_kbuf_list(req, NULL);
	} else if (issue_flags & IO_URING_F_UNLOCKED) {
		struct io_ring_ctx *ctx = req->ctx;

		spin_lock(&ctx->completion_lock);
		cflags = __io_put_kbuf_list(req, &ctx->io_buffers_comp);
		spin_unlock(&ctx->completion_lock);
	} else {
		lockdep_assert_held(&req->ctx->uring_lock);

		cflags = __io_put_kbuf_list(req, &req->ctx->io_buffers_cache);
	}
	return cflags;
}

static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
					      struct io_buffer_list *bl)
{
	if (!list_empty(&bl->buf_list)) {
		struct io_buffer *kbuf;

		kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
		list_del(&kbuf->list);
		if (*len > kbuf->len)
			*len = kbuf->len;
		req->flags |= REQ_F_BUFFER_SELECTED;
		req->kbuf = kbuf;
		req->buf_index = kbuf->bid;
		return u64_to_user_ptr(kbuf->addr);
	}
	return NULL;
}

static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
					  struct io_buffer_list *bl,
					  unsigned int issue_flags)
{
	struct io_uring_buf_ring *br = bl->buf_ring;
	struct io_uring_buf *buf;
	__u16 head = bl->head;

	if (unlikely(smp_load_acquire(&br->tail) == head))
		return NULL;

	head &= bl->mask;
	if (head < IO_BUFFER_LIST_BUF_PER_PAGE) {
		buf = &br->bufs[head];
	} else {
		int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
		int index = head / IO_BUFFER_LIST_BUF_PER_PAGE;
		buf = page_address(bl->buf_pages[index]);
		buf += off;
	}
	if (*len > buf->len)
		*len = buf->len;
	req->flags |= REQ_F_BUFFER_RING;
	req->buf_list = bl;
	req->buf_index = buf->bid;

	if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) {
		/*
		 * If we came in unlocked, we have no choice but to consume the
		 * buffer here, otherwise nothing ensures that the buffer won't
		 * get used by others.
		 * This does mean it'll be pinned until the IO completes, but
		 * coming in unlocked means we're being called from io-wq
		 * context and there may be further retries in async hybrid
		 * mode. For the locked case, the caller must call commit when
		 * the transfer completes (or if we get -EAGAIN and must poll
		 * or retry).
		 */
		req->buf_list = NULL;
		bl->head++;
	}
	return u64_to_user_ptr(buf->addr);
}

void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
			      unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	void __user *ret = NULL;

	io_ring_submit_lock(req->ctx, issue_flags);

	bl = io_buffer_get_list(ctx, req->buf_index);
	if (likely(bl)) {
		if (bl->buf_nr_pages)
			ret = io_ring_buffer_select(req, len, bl, issue_flags);
		else
			ret = io_provided_buffer_select(req, len, bl);
	}
	io_ring_submit_unlock(req->ctx, issue_flags);
	return ret;
}

static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
{
	int i;

	ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list),
			     GFP_KERNEL);
	if (!ctx->io_bl)
		return -ENOMEM;

	for (i = 0; i < BGID_ARRAY; i++) {
		INIT_LIST_HEAD(&ctx->io_bl[i].buf_list);
		ctx->io_bl[i].bgid = i;
	}

	return 0;
}

static int __io_remove_buffers(struct io_ring_ctx *ctx,
			       struct io_buffer_list *bl, unsigned nbufs)
{
	unsigned i = 0;

	/* shouldn't happen */
	if (!nbufs)
		return 0;

	if (bl->buf_nr_pages) {
		int j;

		i = bl->buf_ring->tail - bl->head;
		for (j = 0; j < bl->buf_nr_pages; j++)
			unpin_user_page(bl->buf_pages[j]);
		kvfree(bl->buf_pages);
		bl->buf_pages = NULL;
		bl->buf_nr_pages = 0;
		/* make sure it's seen as empty */
		INIT_LIST_HEAD(&bl->buf_list);
		return i;
	}

	/* the head kbuf is the list itself */
	while (!list_empty(&bl->buf_list)) {
		struct io_buffer *nxt;

		nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
		list_del(&nxt->list);
		if (++i == nbufs)
			return i;
		cond_resched();
	}
	i++;

	return i;
}

void io_destroy_buffers(struct io_ring_ctx *ctx)
{
	struct io_buffer_list *bl;
	unsigned long index;
	int i;

	for (i = 0; i < BGID_ARRAY; i++) {
		if (!ctx->io_bl)
			break;
		__io_remove_buffers(ctx, &ctx->io_bl[i], -1U);
	}

	xa_for_each(&ctx->io_bl_xa, index, bl) {
		xa_erase(&ctx->io_bl_xa, bl->bgid);
		__io_remove_buffers(ctx, bl, -1U);
		kfree(bl);
	}

	while (!list_empty(&ctx->io_buffers_pages)) {
		struct page *page;

		page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
		list_del_init(&page->lru);
		__free_page(page);
	}
}

int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_provide_buf *p = io_kiocb_to_cmd(req);
	u64 tmp;

	if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
	    sqe->splice_fd_in)
		return -EINVAL;

	tmp = READ_ONCE(sqe->fd);
	if (!tmp || tmp > USHRT_MAX)
		return -EINVAL;

	memset(p, 0, sizeof(*p));
	p->nbufs = tmp;
	p->bgid = READ_ONCE(sqe->buf_group);
	return 0;
}

int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_provide_buf *p = io_kiocb_to_cmd(req);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);

	ret = -ENOENT;
	bl = io_buffer_get_list(ctx, p->bgid);
	if (bl) {
		ret = -EINVAL;
		/* can't use provide/remove buffers command on mapped buffers */
		if (!bl->buf_nr_pages)
			ret = __io_remove_buffers(ctx, bl, p->nbufs);
	}
	if (ret < 0)
		req_set_fail(req);

	/* complete before unlock, IOPOLL may need the lock */
	io_req_set_res(req, ret, 0);
	__io_req_complete(req, issue_flags);
	io_ring_submit_unlock(ctx, issue_flags);
	return IOU_ISSUE_SKIP_COMPLETE;
}
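
/*
 * IORING_OP_PROVIDE_BUFFERS packs its arguments into otherwise unused SQE
 * fields, as validated below: sqe->fd carries the number of buffers,
 * sqe->addr the start of the contiguous (nbufs * len) region, sqe->len the
 * per-buffer length, sqe->buf_group the group ID, and sqe->off the ID to
 * assign to the first buffer.
 */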
int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	unsigned long size, tmp_check;
	struct io_provide_buf *p = io_kiocb_to_cmd(req);
	u64 tmp;

	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	tmp = READ_ONCE(sqe->fd);
	if (!tmp || tmp > USHRT_MAX)
		return -E2BIG;
	p->nbufs = tmp;
	p->addr = READ_ONCE(sqe->addr);
	p->len = READ_ONCE(sqe->len);

	if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
				&size))
		return -EOVERFLOW;
	if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
		return -EOVERFLOW;

	size = (unsigned long)p->len * p->nbufs;
	if (!access_ok(u64_to_user_ptr(p->addr), size))
		return -EFAULT;

	p->bgid = READ_ONCE(sqe->buf_group);
	tmp = READ_ONCE(sqe->off);
	if (tmp > USHRT_MAX)
		return -E2BIG;
	p->bid = tmp;
	return 0;
}

static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
{
	struct io_buffer *buf;
	struct page *page;
	int bufs_in_page;

	/*
	 * Completions that don't happen inline (eg not under uring_lock) will
	 * add to ->io_buffers_comp. If we don't have any free buffers, check
	 * the completion list and splice those entries first.
	 */
	if (!list_empty_careful(&ctx->io_buffers_comp)) {
		spin_lock(&ctx->completion_lock);
		if (!list_empty(&ctx->io_buffers_comp)) {
			list_splice_init(&ctx->io_buffers_comp,
					 &ctx->io_buffers_cache);
			spin_unlock(&ctx->completion_lock);
			return 0;
		}
		spin_unlock(&ctx->completion_lock);
	}

	/*
	 * No free buffers and no completion entries either. Allocate a new
	 * page worth of buffer entries and add those to our freelist.
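	 * A page holds PAGE_SIZE / sizeof(struct io_buffer) entries; the
	 * pages themselves are kept on ->io_buffers_pages and freed again
	 * in io_destroy_buffers().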
	 */
	page = alloc_page(GFP_KERNEL_ACCOUNT);
	if (!page)
		return -ENOMEM;

	list_add(&page->lru, &ctx->io_buffers_pages);

	buf = page_address(page);
	bufs_in_page = PAGE_SIZE / sizeof(*buf);
	while (bufs_in_page) {
		list_add_tail(&buf->list, &ctx->io_buffers_cache);
		buf++;
		bufs_in_page--;
	}

	return 0;
}

static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
			  struct io_buffer_list *bl)
{
	struct io_buffer *buf;
	u64 addr = pbuf->addr;
	int i, bid = pbuf->bid;

	for (i = 0; i < pbuf->nbufs; i++) {
		if (list_empty(&ctx->io_buffers_cache) &&
		    io_refill_buffer_cache(ctx))
			break;
		buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
					list);
		list_move_tail(&buf->list, &bl->buf_list);
		buf->addr = addr;
		buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
		buf->bid = bid;
		buf->bgid = pbuf->bgid;
		addr += pbuf->len;
		bid++;
		cond_resched();
	}

	return i ? 0 : -ENOMEM;
}

int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_provide_buf *p = io_kiocb_to_cmd(req);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);

	if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) {
		ret = io_init_bl_list(ctx);
		if (ret)
			goto err;
	}

	bl = io_buffer_get_list(ctx, p->bgid);
	if (unlikely(!bl)) {
		bl = kzalloc(sizeof(*bl), GFP_KERNEL);
		if (!bl) {
			ret = -ENOMEM;
			goto err;
		}
		INIT_LIST_HEAD(&bl->buf_list);
		ret = io_buffer_add_list(ctx, bl, p->bgid);
		if (ret) {
			kfree(bl);
			goto err;
		}
	}
	/* can't add buffers via this command for a mapped buffer ring */
	if (bl->buf_nr_pages) {
		ret = -EINVAL;
		goto err;
	}

	ret = io_add_buffers(ctx, p, bl);
err:
	if (ret < 0)
		req_set_fail(req);
	/* complete before unlock, IOPOLL may need the lock */
	io_req_set_res(req, ret, 0);
	__io_req_complete(req, issue_flags);
	io_ring_submit_unlock(ctx, issue_flags);
	return IOU_ISSUE_SKIP_COMPLETE;
}

int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_buf_ring *br;
	struct io_uring_buf_reg reg;
	struct io_buffer_list *bl, *free_bl = NULL;
	struct page **pages;
	int nr_pages;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;

	if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2])
		return -EINVAL;
	if (!reg.ring_addr)
		return -EFAULT;
	if (reg.ring_addr & ~PAGE_MASK)
		return -EINVAL;
	if (!is_power_of_2(reg.ring_entries))
		return -EINVAL;

	/* cannot disambiguate full vs empty due to head/tail size */
	if (reg.ring_entries >= 65536)
		return -EINVAL;

	if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {
		int ret = io_init_bl_list(ctx);
		if (ret)
			return ret;
	}

	bl = io_buffer_get_list(ctx, reg.bgid);
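	/* an empty group already registered for this bgid can be reused */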
	if (bl) {
		/* if mapped buffer ring OR classic exists, don't allow */
		if (bl->buf_nr_pages || !list_empty(&bl->buf_list))
			return -EEXIST;
	} else {
		free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
		if (!bl)
			return -ENOMEM;
	}

	pages = io_pin_pages(reg.ring_addr,
			     struct_size(br, bufs, reg.ring_entries),
			     &nr_pages);
	if (IS_ERR(pages)) {
		kfree(free_bl);
		return PTR_ERR(pages);
	}

	br = page_address(pages[0]);
	bl->buf_pages = pages;
	bl->buf_nr_pages = nr_pages;
	bl->nr_entries = reg.ring_entries;
	bl->buf_ring = br;
	bl->mask = reg.ring_entries - 1;
	io_buffer_add_list(ctx, bl, reg.bgid);
	return 0;
}

int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_buf_reg reg;
	struct io_buffer_list *bl;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2])
		return -EINVAL;

	bl = io_buffer_get_list(ctx, reg.bgid);
	if (!bl)
		return -ENOENT;
	if (!bl->buf_nr_pages)
		return -EINVAL;

	__io_remove_buffers(ctx, bl, -1U);
	if (bl->bgid >= BGID_ARRAY) {
		xa_erase(&ctx->io_bl_xa, bl->bgid);
		kfree(bl);
	}
	return 0;
}