// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/poll.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "opdef.h"
#include "kbuf.h"

#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf))

/* BIDs are addressed by a 16-bit field in a CQE */
#define MAX_BIDS_PER_BGID (1 << 16)

struct io_provide_buf {
	struct file *file;
	__u64 addr;
	__u32 len;
	__u32 bgid;
	__u32 nbufs;
	__u16 bid;
};

static inline struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx,
							   unsigned int bgid)
{
	return xa_load(&ctx->io_bl_xa, bgid);
}

struct io_buf_free {
	struct hlist_node list;
	void *mem;
	size_t size;
	int inuse;
};

static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
							 unsigned int bgid)
{
	lockdep_assert_held(&ctx->uring_lock);

	return __io_buffer_get_list(ctx, bgid);
}

static int io_buffer_add_list(struct io_ring_ctx *ctx,
			      struct io_buffer_list *bl, unsigned int bgid)
{
	/*
	 * Store buffer group ID and finally mark the list as visible.
	 * The normal lookup doesn't care about the visibility as we're
	 * always under the ->uring_lock, but the RCU lookup from mmap does.
	 */
	bl->bgid = bgid;
	atomic_set(&bl->refs, 1);
	return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
}

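/*
 * Return a selected legacy provided buffer back to its buffer group so it
 * can be handed out again. If the request already did partial IO to the
 * buffer (REQ_F_PARTIAL_IO), it is not recycled.
 */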
void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	struct io_buffer *buf;

	/*
	 * For legacy provided buffer mode, don't recycle if we already did
	 * IO to this buffer. For ring-mapped provided buffer mode, we should
	 * increment ring->head to explicitly monopolize the buffer to avoid
	 * multiple use.
	 */
	if (req->flags & REQ_F_PARTIAL_IO)
		return;

	io_ring_submit_lock(ctx, issue_flags);

	buf = req->kbuf;
	bl = io_buffer_get_list(ctx, buf->bgid);
	list_add(&buf->list, &bl->buf_list);
	req->flags &= ~REQ_F_BUFFER_SELECTED;
	req->buf_index = buf->bgid;

	io_ring_submit_unlock(ctx, issue_flags);
	return;
}

unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
{
	unsigned int cflags;

	/*
	 * We can add this buffer back to two lists:
	 *
	 * 1) The io_buffers_cache list. This one is protected by the
	 *    ctx->uring_lock. If we already hold this lock, add back to this
	 *    list as we can grab it from issue as well.
	 * 2) The io_buffers_comp list. This one is protected by the
	 *    ctx->completion_lock.
	 *
	 * We migrate buffers from the comp_list to the issue cache list
	 * when we need one.
	 */
	if (req->flags & REQ_F_BUFFER_RING) {
		/* no buffers to recycle for this case */
		cflags = __io_put_kbuf_list(req, NULL);
	} else if (issue_flags & IO_URING_F_UNLOCKED) {
		struct io_ring_ctx *ctx = req->ctx;

		spin_lock(&ctx->completion_lock);
		cflags = __io_put_kbuf_list(req, &ctx->io_buffers_comp);
		spin_unlock(&ctx->completion_lock);
	} else {
		lockdep_assert_held(&req->ctx->uring_lock);

		cflags = __io_put_kbuf_list(req, &req->ctx->io_buffers_cache);
	}
	return cflags;
}

static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
					      struct io_buffer_list *bl)
{
	if (!list_empty(&bl->buf_list)) {
		struct io_buffer *kbuf;

		kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
		list_del(&kbuf->list);
		if (*len == 0 || *len > kbuf->len)
			*len = kbuf->len;
		req->flags |= REQ_F_BUFFER_SELECTED;
		req->kbuf = kbuf;
		req->buf_index = kbuf->bid;
		return u64_to_user_ptr(kbuf->addr);
	}
	return NULL;
}

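/*
 * Pick the next buffer from a ring-mapped buffer group. The application
 * publishes new buffers by advancing the ring tail; we read it with
 * smp_load_acquire() and compare it against our head to see if any are
 * available. If the request is issued unlocked, or the file can't poll for
 * a retry, the buffer is committed (head advanced) right here; otherwise
 * the commit is deferred until the caller knows how the transfer went.
 */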
static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
					  struct io_buffer_list *bl,
					  unsigned int issue_flags)
{
	struct io_uring_buf_ring *br = bl->buf_ring;
	struct io_uring_buf *buf;
	__u16 head = bl->head;

	if (unlikely(smp_load_acquire(&br->tail) == head))
		return NULL;

	head &= bl->mask;
	/* mmaped buffers are always contig */
	if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) {
		buf = &br->bufs[head];
	} else {
		int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
		int index = head / IO_BUFFER_LIST_BUF_PER_PAGE;
		buf = page_address(bl->buf_pages[index]);
		buf += off;
	}
	if (*len == 0 || *len > buf->len)
		*len = buf->len;
	req->flags |= REQ_F_BUFFER_RING;
	req->buf_list = bl;
	req->buf_index = buf->bid;

	if (issue_flags & IO_URING_F_UNLOCKED ||
	    (req->file && !file_can_poll(req->file))) {
		/*
		 * If we came in unlocked, we have no choice but to consume the
		 * buffer here, otherwise nothing ensures that the buffer won't
		 * get used by others. This does mean it'll be pinned until the
		 * IO completes; coming in unlocked means we're being called from
		 * io-wq context and there may be further retries in async hybrid
		 * mode. For the locked case, the caller must call commit when
		 * the transfer completes (or if we get -EAGAIN and must poll or
		 * retry).
		 */
		req->buf_list = NULL;
		bl->head++;
	}
	return u64_to_user_ptr(buf->addr);
}

void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
			      unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	void __user *ret = NULL;

	io_ring_submit_lock(req->ctx, issue_flags);

	bl = io_buffer_get_list(ctx, req->buf_index);
	if (likely(bl)) {
		if (bl->is_mapped)
			ret = io_ring_buffer_select(req, len, bl, issue_flags);
		else
			ret = io_provided_buffer_select(req, len, bl);
	}
	io_ring_submit_unlock(req->ctx, issue_flags);
	return ret;
}

/*
 * Mark the given mapped range as free for reuse
 */
static void io_kbuf_mark_free(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
{
	struct io_buf_free *ibf;

	hlist_for_each_entry(ibf, &ctx->io_buf_list, list) {
		if (bl->buf_ring == ibf->mem) {
			ibf->inuse = 0;
			return;
		}
	}

	/* can't happen... */
	WARN_ON_ONCE(1);
}

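/*
 * Tear down up to @nbufs buffers from a group. For ring-mapped groups the
 * backing memory is released (or unpinned) and the count of still unconsumed
 * entries is returned; for legacy groups the buffers are moved back to the
 * per-ctx io_buffers_cache for reuse.
 */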
static int __io_remove_buffers(struct io_ring_ctx *ctx,
			       struct io_buffer_list *bl, unsigned nbufs)
{
	unsigned i = 0;

	/* shouldn't happen */
	if (!nbufs)
		return 0;

	if (bl->is_mapped) {
		i = bl->buf_ring->tail - bl->head;
		if (bl->is_mmap) {
			/*
			 * io_kbuf_list_free() will free the page(s) at
			 * ->release() time.
			 */
			io_kbuf_mark_free(ctx, bl);
			bl->buf_ring = NULL;
			bl->is_mmap = 0;
		} else if (bl->buf_nr_pages) {
			int j;

			for (j = 0; j < bl->buf_nr_pages; j++)
				unpin_user_page(bl->buf_pages[j]);
			kvfree(bl->buf_pages);
			bl->buf_pages = NULL;
			bl->buf_nr_pages = 0;
		}
		/* make sure it's seen as empty */
		INIT_LIST_HEAD(&bl->buf_list);
		bl->is_mapped = 0;
		return i;
	}

	/* protects io_buffers_cache */
	lockdep_assert_held(&ctx->uring_lock);

	while (!list_empty(&bl->buf_list)) {
		struct io_buffer *nxt;

		nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
		list_move(&nxt->list, &ctx->io_buffers_cache);
		if (++i == nbufs)
			return i;
		cond_resched();
	}

	return i;
}

void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
{
	if (atomic_dec_and_test(&bl->refs)) {
		__io_remove_buffers(ctx, bl, -1U);
		kfree_rcu(bl, rcu);
	}
}

void io_destroy_buffers(struct io_ring_ctx *ctx)
{
	struct io_buffer_list *bl;
	unsigned long index;

	xa_for_each(&ctx->io_bl_xa, index, bl) {
		xa_erase(&ctx->io_bl_xa, bl->bgid);
		io_put_bl(ctx, bl);
	}

	while (!list_empty(&ctx->io_buffers_pages)) {
		struct page *page;

		page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
		list_del_init(&page->lru);
		__free_page(page);
	}
}

static void io_destroy_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
{
	xa_erase(&ctx->io_bl_xa, bl->bgid);
	io_put_bl(ctx, bl);
}

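/*
 * IORING_OP_REMOVE_BUFFERS: sqe->fd carries the number of buffers to remove
 * from the group identified by sqe->buf_group. Only valid for legacy
 * provided buffers, not for ring-mapped groups.
 */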
int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	u64 tmp;

	if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
	    sqe->splice_fd_in)
		return -EINVAL;

	tmp = READ_ONCE(sqe->fd);
	if (!tmp || tmp > MAX_BIDS_PER_BGID)
		return -EINVAL;

	memset(p, 0, sizeof(*p));
	p->nbufs = tmp;
	p->bgid = READ_ONCE(sqe->buf_group);
	return 0;
}

int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);

	ret = -ENOENT;
	bl = io_buffer_get_list(ctx, p->bgid);
	if (bl) {
		ret = -EINVAL;
		/* can't use provide/remove buffers command on mapped buffers */
		if (!bl->is_mapped)
			ret = __io_remove_buffers(ctx, bl, p->nbufs);
	}
	io_ring_submit_unlock(ctx, issue_flags);
	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	unsigned long size, tmp_check;
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	u64 tmp;

	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	tmp = READ_ONCE(sqe->fd);
	if (!tmp || tmp > MAX_BIDS_PER_BGID)
		return -E2BIG;
	p->nbufs = tmp;
	p->addr = READ_ONCE(sqe->addr);
	p->len = READ_ONCE(sqe->len);

	if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
				&size))
		return -EOVERFLOW;
	if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
		return -EOVERFLOW;

	size = (unsigned long)p->len * p->nbufs;
	if (!access_ok(u64_to_user_ptr(p->addr), size))
		return -EFAULT;

	p->bgid = READ_ONCE(sqe->buf_group);
	tmp = READ_ONCE(sqe->off);
	if (tmp > USHRT_MAX)
		return -E2BIG;
	if (tmp + p->nbufs > MAX_BIDS_PER_BGID)
		return -EINVAL;
	p->bid = tmp;
	return 0;
}

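/*
 * Make sure ctx->io_buffers_cache has at least one free struct io_buffer,
 * first by reclaiming entries that completions have parked on
 * ->io_buffers_comp, and failing that by allocating a fresh page worth of
 * entries.
 */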
static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
{
	struct io_buffer *buf;
	struct page *page;
	int bufs_in_page;

	/*
	 * Completions that don't happen inline (eg not under uring_lock) will
	 * add to ->io_buffers_comp. If we don't have any free buffers, check
	 * the completion list and splice those entries first.
	 */
	if (!list_empty_careful(&ctx->io_buffers_comp)) {
		spin_lock(&ctx->completion_lock);
		if (!list_empty(&ctx->io_buffers_comp)) {
			list_splice_init(&ctx->io_buffers_comp,
					 &ctx->io_buffers_cache);
			spin_unlock(&ctx->completion_lock);
			return 0;
		}
		spin_unlock(&ctx->completion_lock);
	}

	/*
	 * No free buffers and no completion entries either. Allocate a new
	 * page worth of buffer entries and add those to our freelist.
	 */
	page = alloc_page(GFP_KERNEL_ACCOUNT);
	if (!page)
		return -ENOMEM;

	list_add(&page->lru, &ctx->io_buffers_pages);

	buf = page_address(page);
	bufs_in_page = PAGE_SIZE / sizeof(*buf);
	while (bufs_in_page) {
		list_add_tail(&buf->list, &ctx->io_buffers_cache);
		buf++;
		bufs_in_page--;
	}

	return 0;
}

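/*
 * Carve pbuf->nbufs buffers out of the cached io_buffer entries and append
 * them to the target group, assigning consecutive buffer IDs starting at
 * pbuf->bid. Partial success is fine; -ENOMEM is only returned if not a
 * single buffer could be added.
 */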
static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
			  struct io_buffer_list *bl)
{
	struct io_buffer *buf;
	u64 addr = pbuf->addr;
	int i, bid = pbuf->bid;

	for (i = 0; i < pbuf->nbufs; i++) {
		if (list_empty(&ctx->io_buffers_cache) &&
		    io_refill_buffer_cache(ctx))
			break;
		buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
					list);
		list_move_tail(&buf->list, &bl->buf_list);
		buf->addr = addr;
		buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
		buf->bid = bid;
		buf->bgid = pbuf->bgid;
		addr += pbuf->len;
		bid++;
		cond_resched();
	}

	return i ? 0 : -ENOMEM;
}

int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);

	bl = io_buffer_get_list(ctx, p->bgid);
	if (unlikely(!bl)) {
		bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
		if (!bl) {
			ret = -ENOMEM;
			goto err;
		}
		INIT_LIST_HEAD(&bl->buf_list);
		ret = io_buffer_add_list(ctx, bl, p->bgid);
		if (ret) {
			/*
			 * Doesn't need rcu free as it was never visible, but
			 * let's keep it consistent throughout.
			 */
			kfree_rcu(bl, rcu);
			goto err;
		}
	}
	/* can't add buffers via this command for a mapped buffer ring */
	if (bl->is_mapped) {
		ret = -EINVAL;
		goto err;
	}

	ret = io_add_buffers(ctx, p, bl);
err:
	io_ring_submit_unlock(ctx, issue_flags);

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

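/*
 * Registration path for a buffer ring whose memory the application supplied
 * (IOU_PBUF_RING_MMAP not set): pin the user pages backing the ring and use
 * them directly.
 */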
static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
			    struct io_buffer_list *bl)
{
	struct io_uring_buf_ring *br;
	struct page **pages;
	int i, nr_pages;

	pages = io_pin_pages(reg->ring_addr,
			     flex_array_size(br, bufs, reg->ring_entries),
			     &nr_pages);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	/*
	 * Apparently some 32-bit boxes (ARM) will return highmem pages,
	 * which then need to be mapped. We could support that, but it'd
	 * complicate the code and slow down the common cases quite a bit.
	 * So just error out, returning -EINVAL just like we did on kernels
	 * that didn't support mapped buffer rings.
	 */
	for (i = 0; i < nr_pages; i++)
		if (PageHighMem(pages[i]))
			goto error_unpin;

	br = page_address(pages[0]);
#ifdef SHM_COLOUR
	/*
	 * On platforms that have specific aliasing requirements, SHM_COLOUR
	 * is set and we must guarantee that the kernel and user side align
	 * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and
	 * the application mmaps the provided ring buffer. Fail the request
	 * if we, by chance, don't end up with aligned addresses. The app
	 * should use IOU_PBUF_RING_MMAP instead, and liburing will handle
	 * this transparently.
	 */
	if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1))
		goto error_unpin;
#endif
	bl->buf_pages = pages;
	bl->buf_nr_pages = nr_pages;
	bl->buf_ring = br;
	bl->is_mapped = 1;
	bl->is_mmap = 0;
	return 0;
error_unpin:
	for (i = 0; i < nr_pages; i++)
		unpin_user_page(pages[i]);
	kvfree(pages);
	return -EINVAL;
}

/*
 * See if we have a suitable region that we can reuse, rather than allocate
 * both a new io_buf_free and mem region again. We leave it on the list as
 * even a reused entry will need freeing at ring release.
 */
static struct io_buf_free *io_lookup_buf_free_entry(struct io_ring_ctx *ctx,
						    size_t ring_size)
{
	struct io_buf_free *ibf, *best = NULL;
	size_t best_dist;

	hlist_for_each_entry(ibf, &ctx->io_buf_list, list) {
		size_t dist;

		if (ibf->inuse || ibf->size < ring_size)
			continue;
		dist = ibf->size - ring_size;
		if (!best || dist < best_dist) {
			best = ibf;
			if (!dist)
				break;
			best_dist = dist;
		}
	}

	return best;
}

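/*
 * Registration path for IOU_PBUF_RING_MMAP: the kernel allocates the ring
 * memory (reusing a previously released region if one is big enough) and the
 * application maps it with mmap(2). The backing memory stays on
 * ctx->io_buf_list and is freed by io_kbuf_mmap_list_free() at release time.
 */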
static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
			      struct io_uring_buf_reg *reg,
			      struct io_buffer_list *bl)
{
	struct io_buf_free *ibf;
	size_t ring_size;
	void *ptr;

	ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);

	/* Reuse existing entry, if we can */
	ibf = io_lookup_buf_free_entry(ctx, ring_size);
	if (!ibf) {
		ptr = io_mem_alloc(ring_size);
		if (IS_ERR(ptr))
			return PTR_ERR(ptr);

		/* Allocate and store deferred free entry */
		ibf = kmalloc(sizeof(*ibf), GFP_KERNEL_ACCOUNT);
		if (!ibf) {
			io_mem_free(ptr);
			return -ENOMEM;
		}
		ibf->mem = ptr;
		ibf->size = ring_size;
		hlist_add_head(&ibf->list, &ctx->io_buf_list);
	}
	ibf->inuse = 1;
	bl->buf_ring = ibf->mem;
	bl->is_mapped = 1;
	bl->is_mmap = 1;
	return 0;
}

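/*
 * IORING_REGISTER_PBUF_RING: set up a ring-mapped buffer group. ring_entries
 * must be a power of two below 65536, and the group ID must not already have
 * a mapped ring or classic provided buffers attached.
 */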
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_buf_reg reg;
	struct io_buffer_list *bl, *free_bl = NULL;
	int ret;

	lockdep_assert_held(&ctx->uring_lock);

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;

	if (reg.resv[0] || reg.resv[1] || reg.resv[2])
		return -EINVAL;
	if (reg.flags & ~IOU_PBUF_RING_MMAP)
		return -EINVAL;
	if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
		if (!reg.ring_addr)
			return -EFAULT;
		if (reg.ring_addr & ~PAGE_MASK)
			return -EINVAL;
	} else {
		if (reg.ring_addr)
			return -EINVAL;
	}

	if (!is_power_of_2(reg.ring_entries))
		return -EINVAL;

	/* cannot disambiguate full vs empty due to head/tail size */
	if (reg.ring_entries >= 65536)
		return -EINVAL;

	bl = io_buffer_get_list(ctx, reg.bgid);
	if (bl) {
		/* if mapped buffer ring OR classic exists, don't allow */
		if (bl->is_mapped || !list_empty(&bl->buf_list))
			return -EEXIST;
		io_destroy_bl(ctx, bl);
	}

	free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
	if (!bl)
		return -ENOMEM;

	if (!(reg.flags & IOU_PBUF_RING_MMAP))
		ret = io_pin_pbuf_ring(&reg, bl);
	else
		ret = io_alloc_pbuf_ring(ctx, &reg, bl);

	if (!ret) {
		bl->nr_entries = reg.ring_entries;
		bl->mask = reg.ring_entries - 1;

		io_buffer_add_list(ctx, bl, reg.bgid);
		return 0;
	}

	kfree_rcu(free_bl, rcu);
	return ret;
}

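/*
 * IORING_UNREGISTER_PBUF_RING: drop the registration reference for a
 * ring-mapped buffer group and remove it from the xarray lookup.
 */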
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_buf_reg reg;
	struct io_buffer_list *bl;

	lockdep_assert_held(&ctx->uring_lock);

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (reg.resv[0] || reg.resv[1] || reg.resv[2])
		return -EINVAL;
	if (reg.flags)
		return -EINVAL;

	bl = io_buffer_get_list(ctx, reg.bgid);
	if (!bl)
		return -ENOENT;
	if (!bl->is_mapped)
		return -EINVAL;

	xa_erase(&ctx->io_bl_xa, bl->bgid);
	io_put_bl(ctx, bl);
	return 0;
}

struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
				      unsigned long bgid)
{
	struct io_buffer_list *bl;
	bool ret;

	/*
	 * We have to be a bit careful here - we're inside mmap and cannot grab
	 * the uring_lock. This means the buffer_list could be simultaneously
	 * going away, if someone is trying to be sneaky. Look it up under rcu
	 * so we know it's not going away, and attempt to grab a reference to
	 * it. If the ref is already zero, then fail the mapping. If successful,
	 * the caller will call io_put_bl() to drop the reference at the end.
	 * This may then safely free the buffer_list (and drop the pages) at
	 * that point, as vm_insert_pages() would've already grabbed the
	 * necessary vma references.
	 */
	rcu_read_lock();
	bl = xa_load(&ctx->io_bl_xa, bgid);
	/* must be a mmap'able buffer ring and have pages */
	ret = false;
	if (bl && bl->is_mmap)
		ret = atomic_inc_not_zero(&bl->refs);
	rcu_read_unlock();

	if (ret)
		return bl;

	return ERR_PTR(-EINVAL);
}

/*
 * Called at or after ->release() to free the mmap'ed buffers that we used
 * for memory mapped provided buffer rings.
 */
void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx)
{
	struct io_buf_free *ibf;
	struct hlist_node *tmp;

	hlist_for_each_entry_safe(ibf, tmp, &ctx->io_buf_list, list) {
		hlist_del(&ibf->list);
		io_mem_free(ibf->mem);
		kfree(ibf);
	}
}
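
/*
 * For reference, the userspace side of registering a kernel-allocated,
 * mmap'ed buffer ring looks roughly like the following (illustrative sketch
 * based on the uapi definitions; liburing provides helpers that wrap this):
 *
 *	struct io_uring_buf_reg reg = {
 *		.ring_entries	= nentries,	// power of 2, below 65536
 *		.bgid		= bgid,
 *		.flags		= IOU_PBUF_RING_MMAP,
 *	};
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_PBUF_RING, &reg, 1);
 *
 * The application then mmap()s the ring via the ring fd, fills in
 * io_uring_buf entries (addr/len/bid), and publishes them by advancing the
 * ring tail, which io_ring_buffer_select() above reads with an acquire load.
 */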