/**************************************************************************
 *
 * Copyright © 2017 VMware, Inc., Palo Alto, CA., USA
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

#include "vmwgfx_drv.h"

/*
 * Template that implements find_first_diff() for a generic
 * unsigned integer type. @size and return value are in bytes.
 */
#define VMW_FIND_FIRST_DIFF(_type) \
static size_t vmw_find_first_diff_ ## _type \
	(const _type * dst, const _type * src, size_t size) \
{ \
	size_t i; \
 \
	for (i = 0; i < size; i += sizeof(_type)) { \
		if (*dst++ != *src++) \
			break; \
	} \
 \
	return i; \
}


/*
 * Template that implements find_last_diff() for a generic
 * unsigned integer type. Pointers point to the item following the
 * *end* of the area to be examined. @size and return value are in
 * bytes.
 */
#define VMW_FIND_LAST_DIFF(_type) \
static ssize_t vmw_find_last_diff_ ## _type( \
	const _type * dst, const _type * src, size_t size) \
{ \
	while (size) { \
		if (*--dst != *--src) \
			break; \
 \
		size -= sizeof(_type); \
	} \
	return size; \
}


/*
 * Instantiate find diff functions for relevant unsigned integer sizes,
 * assuming that wider integers are faster (including aligning) up to the
 * architecture native width, which is assumed to be 32 bit unless
 * CONFIG_64BIT is defined.
 */
VMW_FIND_FIRST_DIFF(u8);
VMW_FIND_LAST_DIFF(u8);

VMW_FIND_FIRST_DIFF(u16);
VMW_FIND_LAST_DIFF(u16);

VMW_FIND_FIRST_DIFF(u32);
VMW_FIND_LAST_DIFF(u32);

#ifdef CONFIG_64BIT
VMW_FIND_FIRST_DIFF(u64);
VMW_FIND_LAST_DIFF(u64);
#endif


/* We use size aligned copies. This computes (addr - align(addr)) */
#define SPILL(_var, _type) ((unsigned long) _var & (sizeof(_type) - 1))

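/*
 * Illustration of the helpers above (a sketch, not part of the driver
 * logic; a[] is assumed to be at least u32 aligned):
 *
 *	u8 a[16] = { 0 }, b[16] = { 0 };
 *
 *	b[9] = 1;
 *	vmw_find_first_diff_u8(a, b, 16);		   returns 9
 *	vmw_find_first_diff_u32((u32 *)a, (u32 *)b, 16);   returns 8
 *	SPILL(&a[9], u32);				   9 & 3 == 1 byte past
 *							   a u32 boundary
 *
 * The wider variants compare sizeof(_type) bytes at a time, so their
 * result is only accurate to a multiple of sizeof(_type).
 */
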
/*
 * Template to compute find_first_diff() for a certain integer type
 * including a head copy for alignment, and adjustment of parameters
 * for tail find or increased resolution find using an unsigned integer find
 * of smaller width. If finding is complete, and resolution is sufficient,
 * the macro executes a return statement. Otherwise it falls through.
 */
#define VMW_TRY_FIND_FIRST_DIFF(_type) \
do { \
	unsigned int spill = SPILL(dst, _type); \
	size_t diff_offs; \
 \
	if (spill && spill == SPILL(src, _type) && \
	    sizeof(_type) - spill <= size) { \
		spill = sizeof(_type) - spill; \
		diff_offs = vmw_find_first_diff_u8(dst, src, spill); \
		if (diff_offs < spill) \
			return round_down(offset + diff_offs, granularity); \
 \
		dst += spill; \
		src += spill; \
		size -= spill; \
		offset += spill; \
		spill = 0; \
	} \
	if (!spill && !SPILL(src, _type)) { \
		size_t to_copy = size & ~(sizeof(_type) - 1); \
 \
		diff_offs = vmw_find_first_diff_ ## _type \
			((_type *) dst, (_type *) src, to_copy); \
		if (diff_offs >= size || granularity == sizeof(_type)) \
			return (offset + diff_offs); \
 \
		dst += diff_offs; \
		src += diff_offs; \
		size -= diff_offs; \
		offset += diff_offs; \
	} \
} while (0)


/**
 * vmw_find_first_diff - find the first difference between dst and src
 *
 * @dst: The destination address
 * @src: The source address
 * @size: Number of bytes to compare
 * @granularity: The granularity needed for the return value in bytes.
 * Return: The offset from find start where the first difference was
 * encountered in bytes. If no difference was found, the function returns
 * a value >= @size.
 */
static size_t vmw_find_first_diff(const u8 *dst, const u8 *src, size_t size,
				  size_t granularity)
{
	size_t offset = 0;

	/*
	 * Try finding with large integers if alignment allows, or we can
	 * fix it. Fall through if we need better resolution or alignment
	 * was bad.
	 */
#ifdef CONFIG_64BIT
	VMW_TRY_FIND_FIRST_DIFF(u64);
#endif
	VMW_TRY_FIND_FIRST_DIFF(u32);
	VMW_TRY_FIND_FIRST_DIFF(u16);

	return round_down(offset + vmw_find_first_diff_u8(dst, src, size),
			  granularity);
}

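/*
 * A minimal usage sketch for vmw_find_first_diff() (illustration only,
 * not called from here): with @granularity == 4 and the first differing
 * byte at offset 9, the result is rounded down to the start of the
 * containing 4-byte unit:
 *
 *	vmw_find_first_diff(dst, src, size, 4);		returns 8
 *
 * If the buffers are identical over @size bytes, the return value is
 * >= @size, which callers treat as "no difference found".
 */
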
/*
 * Template to compute find_last_diff() for a certain integer type
 * including a tail copy for alignment, and adjustment of parameters
 * for head find or increased resolution find using an unsigned integer find
 * of smaller width. If finding is complete, and resolution is sufficient,
 * the macro executes a return statement. Otherwise it falls through.
 */
#define VMW_TRY_FIND_LAST_DIFF(_type) \
do { \
	unsigned int spill = SPILL(dst, _type); \
	ssize_t location; \
	ssize_t diff_offs; \
 \
	if (spill && spill <= size && spill == SPILL(src, _type)) { \
		diff_offs = vmw_find_last_diff_u8(dst, src, spill); \
		if (diff_offs) { \
			location = size - spill + diff_offs - 1; \
			return round_down(location, granularity); \
		} \
 \
		dst -= spill; \
		src -= spill; \
		size -= spill; \
		spill = 0; \
	} \
	if (!spill && !SPILL(src, _type)) { \
		size_t to_copy = round_down(size, sizeof(_type)); \
 \
		diff_offs = vmw_find_last_diff_ ## _type \
			((_type *) dst, (_type *) src, to_copy); \
		location = size - to_copy + diff_offs - sizeof(_type); \
		if (location < 0 || granularity == sizeof(_type)) \
			return location; \
 \
		dst -= to_copy - diff_offs; \
		src -= to_copy - diff_offs; \
		size -= to_copy - diff_offs; \
	} \
} while (0)


/**
 * vmw_find_last_diff - find the last difference between dst and src
 *
 * @dst: The destination address
 * @src: The source address
 * @size: Number of bytes to compare
 * @granularity: The granularity needed for the return value in bytes.
 * Return: The offset from find start where the last difference was
 * encountered in bytes, or a negative value if no difference was found.
 */
static ssize_t vmw_find_last_diff(const u8 *dst, const u8 *src, size_t size,
				  size_t granularity)
{
	dst += size;
	src += size;

#ifdef CONFIG_64BIT
	VMW_TRY_FIND_LAST_DIFF(u64);
#endif
	VMW_TRY_FIND_LAST_DIFF(u32);
	VMW_TRY_FIND_LAST_DIFF(u16);

	return round_down(vmw_find_last_diff_u8(dst, src, size) - 1,
			  granularity);
}


/**
 * vmw_memcpy - A wrapper around kernel memcpy that allows plugging it into a
 * struct vmw_diff_cpy.
 *
 * @diff: The struct vmw_diff_cpy closure argument (unused).
 * @dest: The copy destination.
 * @src: The copy source.
 * @n: Number of bytes to copy.
 */
void vmw_memcpy(struct vmw_diff_cpy *diff, u8 *dest, const u8 *src, size_t n)
{
	memcpy(dest, src, n);
}


/**
 * vmw_adjust_rect - Adjust rectangle coordinates for a newly found difference
 *
 * @diff: The struct vmw_diff_cpy used to track the modified bounding box.
 * @diff_offs: The offset from @diff->line_offset where the difference was
 * found.
 */
static void vmw_adjust_rect(struct vmw_diff_cpy *diff, size_t diff_offs)
{
	size_t offs = (diff_offs + diff->line_offset) / diff->cpp;
	struct drm_rect *rect = &diff->rect;

	rect->x1 = min_t(int, rect->x1, offs);
	rect->x2 = max_t(int, rect->x2, offs + 1);
	rect->y1 = min_t(int, rect->y1, diff->line);
	rect->y2 = max_t(int, rect->y2, diff->line + 1);
}

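/*
 * Bounding-box update example (values are hypothetical): with
 * diff->cpp == 4, diff->line_offset == 0 and diff->line == 10, a
 * difference found at byte offset 9 lies in pixel column 9 / 4 == 2,
 * so the box is grown to contain (2, 10):
 *
 *	vmw_adjust_rect(diff, 9);
 *	rect->x1 <= 2, rect->x2 >= 3, rect->y1 <= 10, rect->y2 >= 11
 */
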
/**
 * vmw_diff_memcpy - memcpy that creates a bounding box of modified content.
 *
 * @diff: The struct vmw_diff_cpy used to track the modified bounding box.
 * @dest: The copy destination.
 * @src: The copy source.
 * @n: Number of bytes to copy.
 *
 * In order to correctly track the modified content, the field @diff->line
 * must be pre-loaded with the current line number, the field
 * @diff->line_offset must be pre-loaded with the line offset in bytes where
 * the copy starts, and finally the field @diff->cpp needs to be pre-loaded
 * with the number of bytes per unit in the horizontal direction of the area
 * we're examining. Typically bytes per pixel.
 * This determines the granularity used by the difference computing
 * operations. A higher cpp generally leads to faster execution at the cost of
 * bounding box width precision.
 */
void vmw_diff_memcpy(struct vmw_diff_cpy *diff, u8 *dest, const u8 *src,
		     size_t n)
{
	ssize_t csize, byte_len;

	if (WARN_ON_ONCE(round_down(n, diff->cpp) != n))
		return;

	/* TODO: Possibly use a single vmw_find_first_diff per line? */
	csize = vmw_find_first_diff(dest, src, n, diff->cpp);
	if (csize < n) {
		vmw_adjust_rect(diff, csize);
		byte_len = diff->cpp;

		/*
		 * Starting from where the first difference was found, find
		 * the location of the last difference, and then copy.
		 */
		diff->line_offset += csize;
		dest += csize;
		src += csize;
		n -= csize;
		csize = vmw_find_last_diff(dest, src, n, diff->cpp);
		if (csize >= 0) {
			byte_len += csize;
			vmw_adjust_rect(diff, csize);
		}
		memcpy(dest, src, byte_len);
	}
	diff->line_offset += n;
}

/**
 * struct vmw_bo_blit_line_data - Convenience argument to vmw_bo_cpu_blit_line
 *
 * @mapped_dst: Already mapped destination page index in @dst_pages.
 * @dst_addr: Kernel virtual address of mapped destination page.
 * @dst_pages: Array of destination bo pages.
 * @dst_num_pages: Number of destination bo pages.
 * @dst_prot: Destination bo page protection.
 * @mapped_src: Already mapped source page index in @src_pages.
 * @src_addr: Kernel virtual address of mapped source page.
 * @src_pages: Array of source bo pages.
 * @src_num_pages: Number of source bo pages.
 * @src_prot: Source bo page protection.
 * @diff: Struct vmw_diff_cpy, in the end forwarded to the memcpy routine.
 */
struct vmw_bo_blit_line_data {
	u32 mapped_dst;
	u8 *dst_addr;
	struct page **dst_pages;
	u32 dst_num_pages;
	pgprot_t dst_prot;
	u32 mapped_src;
	u8 *src_addr;
	struct page **src_pages;
	u32 src_num_pages;
	pgprot_t src_prot;
	struct vmw_diff_cpy *diff;
};

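/*
 * Page-split illustration for the line blit below (a sketch assuming
 * PAGE_SIZE == 4096): a 100-byte copy starting at dst_offset == 4090
 * touches two destination pages and is carried out as two chunks:
 *
 *	chunk 1: dst_page 0, dst_page_offset 4090, copy_size 6
 *	chunk 2: dst_page 1, dst_page_offset 0,    copy_size 94
 *
 * copy_size is additionally clamped against the source page boundary,
 * and pages are mapped / unmapped only when the page index changes.
 */
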
/**
 * vmw_bo_cpu_blit_line - Blit part of a line from one bo to another.
 *
 * @d: Blit data as described above.
 * @dst_offset: Destination copy start offset from start of bo.
 * @src_offset: Source copy start offset from start of bo.
 * @bytes_to_copy: Number of bytes to copy in this line.
 */
static int vmw_bo_cpu_blit_line(struct vmw_bo_blit_line_data *d,
				u32 dst_offset,
				u32 src_offset,
				u32 bytes_to_copy)
{
	struct vmw_diff_cpy *diff = d->diff;

	while (bytes_to_copy) {
		u32 copy_size = bytes_to_copy;
		u32 dst_page = dst_offset >> PAGE_SHIFT;
		u32 src_page = src_offset >> PAGE_SHIFT;
		u32 dst_page_offset = dst_offset & ~PAGE_MASK;
		u32 src_page_offset = src_offset & ~PAGE_MASK;
		bool unmap_dst = d->dst_addr && dst_page != d->mapped_dst;
		bool unmap_src = d->src_addr && (src_page != d->mapped_src ||
						 unmap_dst);

		copy_size = min_t(u32, copy_size, PAGE_SIZE - dst_page_offset);
		copy_size = min_t(u32, copy_size, PAGE_SIZE - src_page_offset);

		if (unmap_src) {
			ttm_kunmap_atomic_prot(d->src_addr, d->src_prot);
			d->src_addr = NULL;
		}

		if (unmap_dst) {
			ttm_kunmap_atomic_prot(d->dst_addr, d->dst_prot);
			d->dst_addr = NULL;
		}

		if (!d->dst_addr) {
			if (WARN_ON_ONCE(dst_page >= d->dst_num_pages))
				return -EINVAL;

			d->dst_addr =
				ttm_kmap_atomic_prot(d->dst_pages[dst_page],
						     d->dst_prot);
			if (!d->dst_addr)
				return -ENOMEM;

			d->mapped_dst = dst_page;
		}

		if (!d->src_addr) {
			if (WARN_ON_ONCE(src_page >= d->src_num_pages))
				return -EINVAL;

			d->src_addr =
				ttm_kmap_atomic_prot(d->src_pages[src_page],
						     d->src_prot);
			if (!d->src_addr)
				return -ENOMEM;

			d->mapped_src = src_page;
		}
		diff->do_cpy(diff, d->dst_addr + dst_page_offset,
			     d->src_addr + src_page_offset, copy_size);

		bytes_to_copy -= copy_size;
		dst_offset += copy_size;
		src_offset += copy_size;
	}

	return 0;
}

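/*
 * Caller-side setup sketch for the diff tracking used by vmw_bo_cpu_blit()
 * below (field names are those used by vmw_diff_memcpy() and
 * vmw_adjust_rect() above; the "empty box" starting values are an
 * assumption, not taken from this file):
 *
 *	struct vmw_diff_cpy diff = {};
 *
 *	diff.do_cpy = vmw_diff_memcpy;
 *	diff.cpp = 4;				   bytes per pixel
 *	diff.rect.x1 = diff.rect.y1 = INT_MAX;	   empty box; the min_t/max_t
 *	diff.rect.x2 = diff.rect.y2 = INT_MIN;	   updates shrink-wrap it
 *
 * vmw_bo_cpu_blit() fills in diff.line and diff.line_offset for each
 * scanline before the copy is handed off to diff.do_cpy().
 */
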
/**
 * vmw_bo_cpu_blit - in-kernel cpu blit.
 *
 * @dst: Destination buffer object.
 * @dst_offset: Destination offset of blit start in bytes.
 * @dst_stride: Destination stride in bytes.
 * @src: Source buffer object.
 * @src_offset: Source offset of blit start in bytes.
 * @src_stride: Source stride in bytes.
 * @w: Width of blit in bytes.
 * @h: Height of blit in lines.
 * @diff: The struct vmw_diff_cpy, in the end forwarded to the memcpy routine.
 * Return: Zero on success. Negative error value on failure. Will print out
 * kernel warnings on caller bugs.
 *
 * Performs a CPU blit from one buffer object to another avoiding a full
 * bo vmap which may exhaust or fragment vmalloc space.
 * On supported architectures (x86), we're using kmap_atomic which avoids
 * cross-processor TLB and cache flushes and may, on non-HIGHMEM systems,
 * reference already set-up mappings.
 *
 * Neither of the buffer objects may be placed in PCI memory
 * (Fixed memory in TTM terminology) when using this function.
 */
int vmw_bo_cpu_blit(struct ttm_buffer_object *dst,
		    u32 dst_offset, u32 dst_stride,
		    struct ttm_buffer_object *src,
		    u32 src_offset, u32 src_stride,
		    u32 w, u32 h,
		    struct vmw_diff_cpy *diff)
{
	struct ttm_operation_ctx ctx = {
		.interruptible = false,
		.no_wait_gpu = false
	};
	u32 j, initial_line = dst_offset / dst_stride;
	struct vmw_bo_blit_line_data d;
	int ret = 0;

	/* Buffer objects need to be either pinned or reserved: */
	if (!(dst->mem.placement & TTM_PL_FLAG_NO_EVICT))
		lockdep_assert_held(&dst->resv->lock.base);
	if (!(src->mem.placement & TTM_PL_FLAG_NO_EVICT))
		lockdep_assert_held(&src->resv->lock.base);

	if (dst->ttm->state == tt_unpopulated) {
		ret = dst->ttm->bdev->driver->ttm_tt_populate(dst->ttm, &ctx);
		if (ret)
			return ret;
	}

	if (src->ttm->state == tt_unpopulated) {
		ret = src->ttm->bdev->driver->ttm_tt_populate(src->ttm, &ctx);
		if (ret)
			return ret;
	}

	d.mapped_dst = 0;
	d.mapped_src = 0;
	d.dst_addr = NULL;
	d.src_addr = NULL;
	d.dst_pages = dst->ttm->pages;
	d.src_pages = src->ttm->pages;
	d.dst_num_pages = dst->num_pages;
	d.src_num_pages = src->num_pages;
	d.dst_prot = ttm_io_prot(dst->mem.placement, PAGE_KERNEL);
	d.src_prot = ttm_io_prot(src->mem.placement, PAGE_KERNEL);
	d.diff = diff;

	for (j = 0; j < h; ++j) {
		diff->line = j + initial_line;
		diff->line_offset = dst_offset % dst_stride;
		ret = vmw_bo_cpu_blit_line(&d, dst_offset, src_offset, w);
		if (ret)
			goto out;

		dst_offset += dst_stride;
		src_offset += src_stride;
	}
out:
	if (d.src_addr)
		ttm_kunmap_atomic_prot(d.src_addr, d.src_prot);
	if (d.dst_addr)
		ttm_kunmap_atomic_prot(d.dst_addr, d.dst_prot);

	return ret;
}
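
/*
 * End-to-end usage sketch (hypothetical caller; the bo and stride names
 * are placeholders, and both bos are assumed to be reserved or pinned and
 * not placed in PCI memory, as required above):
 *
 *	struct vmw_diff_cpy diff = { .do_cpy = vmw_memcpy };
 *	int ret;
 *
 *	ret = vmw_bo_cpu_blit(dst_bo, 0, dst_stride,
 *			      src_bo, 0, src_stride,
 *			      width_in_bytes, height, &diff);
 *
 * With .do_cpy == vmw_memcpy the whole rectangle is copied unconditionally;
 * plugging in vmw_diff_memcpy instead copies only the changed spans and
 * leaves the modified bounding box in diff.rect.
 */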