// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/poll.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "opdef.h"
#include "kbuf.h"

#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf))

#define BGID_ARRAY	64

struct io_provide_buf {
	struct file			*file;
	__u64				addr;
	__u32				len;
	__u32				bgid;
	__u16				nbufs;
	__u16				bid;
};

static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
							unsigned int bgid)
{
	if (ctx->io_bl && bgid < BGID_ARRAY)
		return &ctx->io_bl[bgid];

	return xa_load(&ctx->io_bl_xa, bgid);
}

static int io_buffer_add_list(struct io_ring_ctx *ctx,
			      struct io_buffer_list *bl, unsigned int bgid)
{
	bl->bgid = bgid;
	if (bgid < BGID_ARRAY)
		return 0;

	return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
}

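/*
 * Return a legacy provided buffer to its group so it can be selected
 * again, unless IO has already been partially done to it.
 */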
void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	struct io_buffer *buf;

	/*
	 * For legacy provided buffer mode, don't recycle if we already did
	 * IO to this buffer. For ring-mapped provided buffer mode, we should
	 * increment ring->head to explicitly monopolize the buffer to avoid
	 * multiple use.
	 */
	if (req->flags & REQ_F_PARTIAL_IO)
		return;

	io_ring_submit_lock(ctx, issue_flags);

	buf = req->kbuf;
	bl = io_buffer_get_list(ctx, buf->bgid);
	list_add(&buf->list, &bl->buf_list);
	req->flags &= ~REQ_F_BUFFER_SELECTED;
	req->buf_index = buf->bgid;

	io_ring_submit_unlock(ctx, issue_flags);
}

unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
{
	unsigned int cflags;

	/*
	 * We can add this buffer back to two lists:
	 *
	 * 1) The io_buffers_cache list. This one is protected by the
	 *    ctx->uring_lock. If we already hold this lock, add back to this
	 *    list as we can grab it from issue as well.
	 * 2) The io_buffers_comp list. This one is protected by the
	 *    ctx->completion_lock.
	 *
	 * We migrate buffers from the comp_list to the issue cache list
	 * when we need one.
	 */
	if (req->flags & REQ_F_BUFFER_RING) {
		/* no buffers to recycle for this case */
		cflags = __io_put_kbuf_list(req, NULL);
	} else if (issue_flags & IO_URING_F_UNLOCKED) {
		struct io_ring_ctx *ctx = req->ctx;

		spin_lock(&ctx->completion_lock);
		cflags = __io_put_kbuf_list(req, &ctx->io_buffers_comp);
		spin_unlock(&ctx->completion_lock);
	} else {
		lockdep_assert_held(&req->ctx->uring_lock);

		cflags = __io_put_kbuf_list(req, &req->ctx->io_buffers_cache);
	}
	return cflags;
}

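/*
 * Legacy provided buffer selection: pop the first buffer off the group
 * list and mark the request as owning it via REQ_F_BUFFER_SELECTED.
 */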
static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
					      struct io_buffer_list *bl)
{
	if (!list_empty(&bl->buf_list)) {
		struct io_buffer *kbuf;

		kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
		list_del(&kbuf->list);
		if (*len == 0 || *len > kbuf->len)
			*len = kbuf->len;
		req->flags |= REQ_F_BUFFER_SELECTED;
		req->kbuf = kbuf;
		req->buf_index = kbuf->bid;
		return u64_to_user_ptr(kbuf->addr);
	}
	return NULL;
}

static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
					  struct io_buffer_list *bl,
					  unsigned int issue_flags)
{
	struct io_uring_buf_ring *br = bl->buf_ring;
	struct io_uring_buf *buf;
	__u16 head = bl->head;

	if (unlikely(smp_load_acquire(&br->tail) == head))
		return NULL;

	head &= bl->mask;
	/* mmapped buffers are always contiguous */
	if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) {
		buf = &br->bufs[head];
	} else {
		int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
		int index = head / IO_BUFFER_LIST_BUF_PER_PAGE;

		buf = page_address(bl->buf_pages[index]);
		buf += off;
	}
	if (*len == 0 || *len > buf->len)
		*len = buf->len;
	req->flags |= REQ_F_BUFFER_RING;
	req->buf_list = bl;
	req->buf_index = buf->bid;

	if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) {
		/*
		 * If we came in unlocked, we have no choice but to consume the
		 * buffer here, otherwise nothing ensures that the buffer won't
		 * get used by others. This does mean it'll be pinned until the
		 * IO completes, coming in unlocked means we're being called from
		 * io-wq context and there may be further retries in async hybrid
		 * mode. For the locked case, the caller must call commit when
		 * the transfer completes (or if we get -EAGAIN and must poll or
		 * retry).
		 */
		req->buf_list = NULL;
		bl->head++;
	}
	return u64_to_user_ptr(buf->addr);
}

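/*
 * Select a buffer from the group indicated by req->buf_index, using the
 * ring-mapped path if the group was registered as a buffer ring and the
 * legacy provided buffer path otherwise.
 */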
void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
			      unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	void __user *ret = NULL;

	io_ring_submit_lock(req->ctx, issue_flags);

	bl = io_buffer_get_list(ctx, req->buf_index);
	if (likely(bl)) {
		if (bl->is_mapped)
			ret = io_ring_buffer_select(req, len, bl, issue_flags);
		else
			ret = io_provided_buffer_select(req, len, bl);
	}
	io_ring_submit_unlock(req->ctx, issue_flags);
	return ret;
}

static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
{
	int i;

	ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list),
			     GFP_KERNEL);
	if (!ctx->io_bl)
		return -ENOMEM;

	for (i = 0; i < BGID_ARRAY; i++) {
		INIT_LIST_HEAD(&ctx->io_bl[i].buf_list);
		ctx->io_bl[i].bgid = i;
	}

	return 0;
}

static int __io_remove_buffers(struct io_ring_ctx *ctx,
			       struct io_buffer_list *bl, unsigned nbufs)
{
	unsigned i = 0;

	/* shouldn't happen */
	if (!nbufs)
		return 0;

	if (bl->is_mapped) {
		i = bl->buf_ring->tail - bl->head;
		if (bl->is_mmap) {
			if (bl->buf_ring) {
				struct page *page;

				page = virt_to_head_page(bl->buf_ring);
				if (put_page_testzero(page))
					free_compound_page(page);
				bl->buf_ring = NULL;
			}
			bl->is_mmap = 0;
		} else if (bl->buf_nr_pages) {
			int j;

			for (j = 0; j < bl->buf_nr_pages; j++)
				unpin_user_page(bl->buf_pages[j]);
			kvfree(bl->buf_pages);
			bl->buf_pages = NULL;
			bl->buf_nr_pages = 0;
		}
		/* make sure it's seen as empty */
		INIT_LIST_HEAD(&bl->buf_list);
		bl->is_mapped = 0;
		return i;
	}

	/* the head kbuf is the list itself */
	while (!list_empty(&bl->buf_list)) {
		struct io_buffer *nxt;

		nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
		list_del(&nxt->list);
		if (++i == nbufs)
			return i;
		cond_resched();
	}
	i++;

	return i;
}

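/*
 * Called at ring teardown: frees the statically indexed buffer groups,
 * every group stored in the xarray, and the pages backing the io_buffer
 * cache.
 */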
void io_destroy_buffers(struct io_ring_ctx *ctx)
{
	struct io_buffer_list *bl;
	unsigned long index;
	int i;

	for (i = 0; i < BGID_ARRAY; i++) {
		if (!ctx->io_bl)
			break;
		__io_remove_buffers(ctx, &ctx->io_bl[i], -1U);
	}

	xa_for_each(&ctx->io_bl_xa, index, bl) {
		xa_erase(&ctx->io_bl_xa, bl->bgid);
		__io_remove_buffers(ctx, bl, -1U);
		kfree(bl);
	}

	while (!list_empty(&ctx->io_buffers_pages)) {
		struct page *page;

		page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
		list_del_init(&page->lru);
		__free_page(page);
	}
}

int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	u64 tmp;

	if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
	    sqe->splice_fd_in)
		return -EINVAL;

	tmp = READ_ONCE(sqe->fd);
	if (!tmp || tmp > USHRT_MAX)
		return -EINVAL;

	memset(p, 0, sizeof(*p));
	p->nbufs = tmp;
	p->bgid = READ_ONCE(sqe->buf_group);
	return 0;
}

int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);

	ret = -ENOENT;
	bl = io_buffer_get_list(ctx, p->bgid);
	if (bl) {
		ret = -EINVAL;
		/* can't use provide/remove buffers command on mapped buffers */
		if (!bl->is_mapped)
			ret = __io_remove_buffers(ctx, bl, p->nbufs);
	}
	io_ring_submit_unlock(ctx, issue_flags);
	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

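/*
 * For IORING_OP_PROVIDE_BUFFERS the SQE encodes nbufs in ->fd, the base
 * address in ->addr, the per-buffer length in ->len, the group ID in
 * ->buf_group and the first buffer ID in ->off. For illustration, a
 * userspace sketch with liburing (group 0, eight 4k buffers) might be:
 *
 *	sqe = io_uring_get_sqe(&ring);
 *	io_uring_prep_provide_buffers(sqe, mem, 4096, 8, 0, 0);
 *	io_uring_submit(&ring);
 */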
int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	unsigned long size, tmp_check;
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	u64 tmp;

	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	tmp = READ_ONCE(sqe->fd);
	if (!tmp || tmp > USHRT_MAX)
		return -E2BIG;
	p->nbufs = tmp;
	p->addr = READ_ONCE(sqe->addr);
	p->len = READ_ONCE(sqe->len);

	if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
			       &size))
		return -EOVERFLOW;
	if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
		return -EOVERFLOW;

	size = (unsigned long)p->len * p->nbufs;
	if (!access_ok(u64_to_user_ptr(p->addr), size))
		return -EFAULT;

	p->bgid = READ_ONCE(sqe->buf_group);
	tmp = READ_ONCE(sqe->off);
	if (tmp > USHRT_MAX)
		return -E2BIG;
	if (tmp + p->nbufs >= USHRT_MAX)
		return -EINVAL;
	p->bid = tmp;
	return 0;
}

static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
{
	struct io_buffer *buf;
	struct page *page;
	int bufs_in_page;

	/*
	 * Completions that don't happen inline (eg not under uring_lock) will
	 * add to ->io_buffers_comp. If we don't have any free buffers, check
	 * the completion list and splice those entries first.
	 */
	if (!list_empty_careful(&ctx->io_buffers_comp)) {
		spin_lock(&ctx->completion_lock);
		if (!list_empty(&ctx->io_buffers_comp)) {
			list_splice_init(&ctx->io_buffers_comp,
					 &ctx->io_buffers_cache);
			spin_unlock(&ctx->completion_lock);
			return 0;
		}
		spin_unlock(&ctx->completion_lock);
	}

	/*
	 * No free buffers and no completion entries either. Allocate a new
	 * page worth of buffer entries and add those to our freelist.
	 */
	page = alloc_page(GFP_KERNEL_ACCOUNT);
	if (!page)
		return -ENOMEM;

	list_add(&page->lru, &ctx->io_buffers_pages);

	buf = page_address(page);
	bufs_in_page = PAGE_SIZE / sizeof(*buf);
	while (bufs_in_page) {
		list_add_tail(&buf->list, &ctx->io_buffers_cache);
		buf++;
		bufs_in_page--;
	}

	return 0;
}

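/*
 * Add pbuf->nbufs buffers carved out of the user provided address range
 * to the buffer list, assigning consecutive buffer IDs starting at
 * pbuf->bid. Each buffer's length is capped at MAX_RW_COUNT.
 */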
static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
			  struct io_buffer_list *bl)
{
	struct io_buffer *buf;
	u64 addr = pbuf->addr;
	int i, bid = pbuf->bid;

	for (i = 0; i < pbuf->nbufs; i++) {
		if (list_empty(&ctx->io_buffers_cache) &&
		    io_refill_buffer_cache(ctx))
			break;
		buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
				       list);
		list_move_tail(&buf->list, &bl->buf_list);
		buf->addr = addr;
		buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
		buf->bid = bid;
		buf->bgid = pbuf->bgid;
		addr += pbuf->len;
		bid++;
		cond_resched();
	}

	return i ? 0 : -ENOMEM;
}

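/*
 * Issue handler for IORING_OP_PROVIDE_BUFFERS: look up the buffer group,
 * allocating it on first use, and add the requested buffers. Groups backed
 * by a mapped buffer ring can't be extended this way.
 */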
int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);

	if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) {
		ret = io_init_bl_list(ctx);
		if (ret)
			goto err;
	}

	bl = io_buffer_get_list(ctx, p->bgid);
	if (unlikely(!bl)) {
		bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
		if (!bl) {
			ret = -ENOMEM;
			goto err;
		}
		INIT_LIST_HEAD(&bl->buf_list);
		ret = io_buffer_add_list(ctx, bl, p->bgid);
		if (ret) {
			kfree(bl);
			goto err;
		}
	}
	/* can't add buffers via this command for a mapped buffer ring */
	if (bl->is_mapped) {
		ret = -EINVAL;
		goto err;
	}

	ret = io_add_buffers(ctx, p, bl);
err:
	io_ring_submit_unlock(ctx, issue_flags);

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

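/*
 * Map a buffer ring that the application allocated itself: pin the user
 * pages and use the kernel mapping of the first page as the ring address.
 */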
static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
			    struct io_buffer_list *bl)
{
	struct io_uring_buf_ring *br;
	struct page **pages;
	int nr_pages;

	pages = io_pin_pages(reg->ring_addr,
			     flex_array_size(br, bufs, reg->ring_entries),
			     &nr_pages);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	br = page_address(pages[0]);
#ifdef SHM_COLOUR
	/*
	 * On platforms that have specific aliasing requirements, SHM_COLOUR
	 * is set and we must guarantee that the kernel and user side align
	 * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and
	 * the application mmap's the provided ring buffer. Fail the request
	 * if we, by chance, don't end up with aligned addresses. The app
	 * should use IOU_PBUF_RING_MMAP instead, and liburing will handle
	 * this transparently.
	 */
	if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) {
		int i;

		for (i = 0; i < nr_pages; i++)
			unpin_user_page(pages[i]);
		return -EINVAL;
	}
#endif
	bl->buf_pages = pages;
	bl->buf_nr_pages = nr_pages;
	bl->buf_ring = br;
	bl->is_mapped = 1;
	bl->is_mmap = 0;
	return 0;
}

static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg,
			      struct io_buffer_list *bl)
{
	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
	size_t ring_size;
	void *ptr;

	ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
	ptr = (void *) __get_free_pages(gfp, get_order(ring_size));
	if (!ptr)
		return -ENOMEM;

	bl->buf_ring = ptr;
	bl->is_mapped = 1;
	bl->is_mmap = 1;
	return 0;
}

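/*
 * Buffer ring registration comes in via io_uring_register() with
 * IORING_REGISTER_PBUF_RING and a struct io_uring_buf_reg. For
 * illustration, a userspace sketch with liburing, assuming ring_mem is a
 * page-aligned allocation sized for a power-of-2 number of entries:
 *
 *	struct io_uring_buf_reg reg = {
 *		.ring_addr = (unsigned long) ring_mem,
 *		.ring_entries = 8,
 *		.bgid = 0,
 *	};
 *
 *	io_uring_register_buf_ring(&ring, &reg, 0);
 */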
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_buf_reg reg;
	struct io_buffer_list *bl, *free_bl = NULL;
	int ret;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;

	if (reg.resv[0] || reg.resv[1] || reg.resv[2])
		return -EINVAL;
	if (reg.flags & ~IOU_PBUF_RING_MMAP)
		return -EINVAL;
	if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
		if (!reg.ring_addr)
			return -EFAULT;
		if (reg.ring_addr & ~PAGE_MASK)
			return -EINVAL;
	} else {
		if (reg.ring_addr)
			return -EINVAL;
	}

	if (!is_power_of_2(reg.ring_entries))
		return -EINVAL;

	/* cannot disambiguate full vs empty due to head/tail size */
	if (reg.ring_entries >= 65536)
		return -EINVAL;

	if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {
		int ret = io_init_bl_list(ctx);
		if (ret)
			return ret;
	}

	bl = io_buffer_get_list(ctx, reg.bgid);
	if (bl) {
		/* if mapped buffer ring OR classic exists, don't allow */
		if (bl->is_mapped || !list_empty(&bl->buf_list))
			return -EEXIST;
	} else {
		free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
		if (!bl)
			return -ENOMEM;
	}

	if (!(reg.flags & IOU_PBUF_RING_MMAP))
		ret = io_pin_pbuf_ring(&reg, bl);
	else
		ret = io_alloc_pbuf_ring(&reg, bl);

	if (!ret) {
		bl->nr_entries = reg.ring_entries;
		bl->mask = reg.ring_entries - 1;

		io_buffer_add_list(ctx, bl, reg.bgid);
		return 0;
	}

	kfree(free_bl);
	return ret;
}

int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_buf_reg reg;
	struct io_buffer_list *bl;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (reg.resv[0] || reg.resv[1] || reg.resv[2])
		return -EINVAL;
	if (reg.flags)
		return -EINVAL;

	bl = io_buffer_get_list(ctx, reg.bgid);
	if (!bl)
		return -ENOENT;
	if (!bl->is_mapped)
		return -EINVAL;

	__io_remove_buffers(ctx, bl, -1U);
	if (bl->bgid >= BGID_ARRAY) {
		xa_erase(&ctx->io_bl_xa, bl->bgid);
		kfree(bl);
	}
	return 0;
}

void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
{
	struct io_buffer_list *bl;

	bl = io_buffer_get_list(ctx, bgid);
	if (!bl || !bl->is_mmap)
		return NULL;

	return bl->buf_ring;
}