1 /* 2 * Copyright (c) 2008 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * Authors: 24 * Eric Anholt <eric@anholt.net> 25 * Keith Packard <keithp@keithp.com> 26 * Mika Kuoppala <mika.kuoppala@intel.com> 27 * 28 */ 29 30 #include <linux/ascii85.h> 31 #include <linux/nmi.h> 32 #include <linux/pagevec.h> 33 #include <linux/scatterlist.h> 34 #include <linux/utsname.h> 35 #include <linux/zlib.h> 36 37 #include <drm/drm_print.h> 38 39 #include "display/intel_atomic.h" 40 #include "display/intel_overlay.h" 41 42 #include "gem/i915_gem_context.h" 43 44 #include "i915_drv.h" 45 #include "i915_gpu_error.h" 46 #include "i915_memcpy.h" 47 #include "i915_scatterlist.h" 48 #include "intel_csr.h" 49 50 #define ALLOW_FAIL (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN) 51 #define ATOMIC_MAYFAIL (GFP_ATOMIC | __GFP_NOWARN) 52 53 static void __sg_set_buf(struct scatterlist *sg, 54 void *addr, unsigned int len, loff_t it) 55 { 56 sg->page_link = (unsigned long)virt_to_page(addr); 57 sg->offset = offset_in_page(addr); 58 sg->length = len; 59 sg->dma_address = it; 60 } 61 62 static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len) 63 { 64 if (!len) 65 return false; 66 67 if (e->bytes + len + 1 <= e->size) 68 return true; 69 70 if (e->bytes) { 71 __sg_set_buf(e->cur++, e->buf, e->bytes, e->iter); 72 e->iter += e->bytes; 73 e->buf = NULL; 74 e->bytes = 0; 75 } 76 77 if (e->cur == e->end) { 78 struct scatterlist *sgl; 79 80 sgl = (typeof(sgl))__get_free_page(ALLOW_FAIL); 81 if (!sgl) { 82 e->err = -ENOMEM; 83 return false; 84 } 85 86 if (e->cur) { 87 e->cur->offset = 0; 88 e->cur->length = 0; 89 e->cur->page_link = 90 (unsigned long)sgl | SG_CHAIN; 91 } else { 92 e->sgl = sgl; 93 } 94 95 e->cur = sgl; 96 e->end = sgl + SG_MAX_SINGLE_ALLOC - 1; 97 } 98 99 e->size = ALIGN(len + 1, SZ_64K); 100 e->buf = kmalloc(e->size, ALLOW_FAIL); 101 if (!e->buf) { 102 e->size = PAGE_ALIGN(len + 1); 103 e->buf = kmalloc(e->size, GFP_KERNEL); 104 } 105 if (!e->buf) { 106 e->err = -ENOMEM; 107 return false; 108 } 109 110 return true; 111 } 112 113 __printf(2, 0) 114 static void i915_error_vprintf(struct drm_i915_error_state_buf *e, 115 const char *fmt, va_list args) 116 { 117 va_list ap; 118 int len; 119 120 if (e->err) 121 return; 122 123 va_copy(ap, args); 124 len = vsnprintf(NULL, 0, fmt, ap); 125 va_end(ap); 126 if (len <= 0) { 127 e->err = len; 128 return; 129 } 130 131 if (!__i915_error_grow(e, len)) 132 return; 133 134 GEM_BUG_ON(e->bytes >= e->size); 135 len = vscnprintf(e->buf + e->bytes, e->size - e->bytes, fmt, args); 136 if (len < 0) { 137 e->err = len; 138 return; 139 } 140 e->bytes += len; 141 } 142 143 static void i915_error_puts(struct drm_i915_error_state_buf *e, const char *str) 144 { 145 unsigned len; 146 147 if (e->err || !str) 148 return; 149 150 len = strlen(str); 151 if (!__i915_error_grow(e, len)) 152 return; 153 154 GEM_BUG_ON(e->bytes + len > e->size); 155 memcpy(e->buf + e->bytes, str, len); 156 e->bytes += len; 157 } 158 159 #define err_printf(e, ...) i915_error_printf(e, __VA_ARGS__) 160 #define err_puts(e, s) i915_error_puts(e, s) 161 162 static void __i915_printfn_error(struct drm_printer *p, struct va_format *vaf) 163 { 164 i915_error_vprintf(p->arg, vaf->fmt, *vaf->va); 165 } 166 167 static inline struct drm_printer 168 i915_error_printer(struct drm_i915_error_state_buf *e) 169 { 170 struct drm_printer p = { 171 .printfn = __i915_printfn_error, 172 .arg = e, 173 }; 174 return p; 175 } 176 177 /* single threaded page allocator with a reserved stash for emergencies */ 178 static void pool_fini(struct pagevec *pv) 179 { 180 pagevec_release(pv); 181 } 182 183 static int pool_refill(struct pagevec *pv, gfp_t gfp) 184 { 185 while (pagevec_space(pv)) { 186 struct page *p; 187 188 p = alloc_page(gfp); 189 if (!p) 190 return -ENOMEM; 191 192 pagevec_add(pv, p); 193 } 194 195 return 0; 196 } 197 198 static int pool_init(struct pagevec *pv, gfp_t gfp) 199 { 200 int err; 201 202 pagevec_init(pv); 203 204 err = pool_refill(pv, gfp); 205 if (err) 206 pool_fini(pv); 207 208 return err; 209 } 210 211 static void *pool_alloc(struct pagevec *pv, gfp_t gfp) 212 { 213 struct page *p; 214 215 p = alloc_page(gfp); 216 if (!p && pagevec_count(pv)) 217 p = pv->pages[--pv->nr]; 218 219 return p ? page_address(p) : NULL; 220 } 221 222 static void pool_free(struct pagevec *pv, void *addr) 223 { 224 struct page *p = virt_to_page(addr); 225 226 if (pagevec_space(pv)) 227 pagevec_add(pv, p); 228 else 229 __free_page(p); 230 } 231 232 #ifdef CONFIG_DRM_I915_COMPRESS_ERROR 233 234 struct compress { 235 struct pagevec pool; 236 struct z_stream_s zstream; 237 void *tmp; 238 }; 239 240 static bool compress_init(struct compress *c) 241 { 242 struct z_stream_s *zstream = &c->zstream; 243 244 if (pool_init(&c->pool, ALLOW_FAIL)) 245 return false; 246 247 zstream->workspace = 248 kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), 249 ALLOW_FAIL); 250 if (!zstream->workspace) { 251 pool_fini(&c->pool); 252 return false; 253 } 254 255 c->tmp = NULL; 256 if (i915_has_memcpy_from_wc()) 257 c->tmp = pool_alloc(&c->pool, ALLOW_FAIL); 258 259 return true; 260 } 261 262 static bool compress_start(struct compress *c) 263 { 264 struct z_stream_s *zstream = &c->zstream; 265 void *workspace = zstream->workspace; 266 267 memset(zstream, 0, sizeof(*zstream)); 268 zstream->workspace = workspace; 269 270 return zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) == Z_OK; 271 } 272 273 static void *compress_next_page(struct compress *c, 274 struct drm_i915_error_object *dst) 275 { 276 void *page; 277 278 if (dst->page_count >= dst->num_pages) 279 return ERR_PTR(-ENOSPC); 280 281 page = pool_alloc(&c->pool, ALLOW_FAIL); 282 if (!page) 283 return ERR_PTR(-ENOMEM); 284 285 return dst->pages[dst->page_count++] = page; 286 } 287 288 static int compress_page(struct compress *c, 289 void *src, 290 struct drm_i915_error_object *dst) 291 { 292 struct z_stream_s *zstream = &c->zstream; 293 294 zstream->next_in = src; 295 if (c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE)) 296 zstream->next_in = c->tmp; 297 zstream->avail_in = PAGE_SIZE; 298 299 do { 300 if (zstream->avail_out == 0) { 301 zstream->next_out = compress_next_page(c, dst); 302 if (IS_ERR(zstream->next_out)) 303 return PTR_ERR(zstream->next_out); 304 305 zstream->avail_out = PAGE_SIZE; 306 } 307 308 if (zlib_deflate(zstream, Z_NO_FLUSH) != Z_OK) 309 return -EIO; 310 } while (zstream->avail_in); 311 312 /* Fallback to uncompressed if we increase size? */ 313 if (0 && zstream->total_out > zstream->total_in) 314 return -E2BIG; 315 316 return 0; 317 } 318 319 static int compress_flush(struct compress *c, 320 struct drm_i915_error_object *dst) 321 { 322 struct z_stream_s *zstream = &c->zstream; 323 324 do { 325 switch (zlib_deflate(zstream, Z_FINISH)) { 326 case Z_OK: /* more space requested */ 327 zstream->next_out = compress_next_page(c, dst); 328 if (IS_ERR(zstream->next_out)) 329 return PTR_ERR(zstream->next_out); 330 331 zstream->avail_out = PAGE_SIZE; 332 break; 333 334 case Z_STREAM_END: 335 goto end; 336 337 default: /* any error */ 338 return -EIO; 339 } 340 } while (1); 341 342 end: 343 memset(zstream->next_out, 0, zstream->avail_out); 344 dst->unused = zstream->avail_out; 345 return 0; 346 } 347 348 static void compress_finish(struct compress *c) 349 { 350 zlib_deflateEnd(&c->zstream); 351 } 352 353 static void compress_fini(struct compress *c) 354 { 355 kfree(c->zstream.workspace); 356 if (c->tmp) 357 pool_free(&c->pool, c->tmp); 358 pool_fini(&c->pool); 359 } 360 361 static void err_compression_marker(struct drm_i915_error_state_buf *m) 362 { 363 err_puts(m, ":"); 364 } 365 366 #else 367 368 struct compress { 369 struct pagevec pool; 370 }; 371 372 static bool compress_init(struct compress *c) 373 { 374 return pool_init(&c->pool, ALLOW_FAIL) == 0; 375 } 376 377 static bool compress_start(struct compress *c) 378 { 379 return true; 380 } 381 382 static int compress_page(struct compress *c, 383 void *src, 384 struct drm_i915_error_object *dst) 385 { 386 void *ptr; 387 388 ptr = pool_alloc(&c->pool, ALLOW_FAIL); 389 if (!ptr) 390 return -ENOMEM; 391 392 if (!i915_memcpy_from_wc(ptr, src, PAGE_SIZE)) 393 memcpy(ptr, src, PAGE_SIZE); 394 dst->pages[dst->page_count++] = ptr; 395 396 return 0; 397 } 398 399 static int compress_flush(struct compress *c, 400 struct drm_i915_error_object *dst) 401 { 402 return 0; 403 } 404 405 static void compress_finish(struct compress *c) 406 { 407 } 408 409 static void compress_fini(struct compress *c) 410 { 411 pool_fini(&c->pool); 412 } 413 414 static void err_compression_marker(struct drm_i915_error_state_buf *m) 415 { 416 err_puts(m, "~"); 417 } 418 419 #endif 420 421 static void error_print_instdone(struct drm_i915_error_state_buf *m, 422 const struct drm_i915_error_engine *ee) 423 { 424 int slice; 425 int subslice; 426 427 err_printf(m, " INSTDONE: 0x%08x\n", 428 ee->instdone.instdone); 429 430 if (ee->engine->class != RENDER_CLASS || INTEL_GEN(m->i915) <= 3) 431 return; 432 433 err_printf(m, " SC_INSTDONE: 0x%08x\n", 434 ee->instdone.slice_common); 435 436 if (INTEL_GEN(m->i915) <= 6) 437 return; 438 439 for_each_instdone_slice_subslice(m->i915, slice, subslice) 440 err_printf(m, " SAMPLER_INSTDONE[%d][%d]: 0x%08x\n", 441 slice, subslice, 442 ee->instdone.sampler[slice][subslice]); 443 444 for_each_instdone_slice_subslice(m->i915, slice, subslice) 445 err_printf(m, " ROW_INSTDONE[%d][%d]: 0x%08x\n", 446 slice, subslice, 447 ee->instdone.row[slice][subslice]); 448 } 449 450 static void error_print_request(struct drm_i915_error_state_buf *m, 451 const char *prefix, 452 const struct drm_i915_error_request *erq, 453 const unsigned long epoch) 454 { 455 if (!erq->seqno) 456 return; 457 458 err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n", 459 prefix, erq->pid, erq->context, erq->seqno, 460 test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, 461 &erq->flags) ? "!" : "", 462 test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, 463 &erq->flags) ? "+" : "", 464 erq->sched_attr.priority, 465 jiffies_to_msecs(erq->jiffies - epoch), 466 erq->start, erq->head, erq->tail); 467 } 468 469 static void error_print_context(struct drm_i915_error_state_buf *m, 470 const char *header, 471 const struct drm_i915_error_context *ctx) 472 { 473 err_printf(m, "%s%s[%d] hw_id %d, prio %d, guilty %d active %d\n", 474 header, ctx->comm, ctx->pid, ctx->hw_id, 475 ctx->sched_attr.priority, ctx->guilty, ctx->active); 476 } 477 478 static void error_print_engine(struct drm_i915_error_state_buf *m, 479 const struct drm_i915_error_engine *ee, 480 const unsigned long epoch) 481 { 482 int n; 483 484 err_printf(m, "%s command stream:\n", ee->engine->name); 485 err_printf(m, " IDLE?: %s\n", yesno(ee->idle)); 486 err_printf(m, " START: 0x%08x\n", ee->start); 487 err_printf(m, " HEAD: 0x%08x [0x%08x]\n", ee->head, ee->rq_head); 488 err_printf(m, " TAIL: 0x%08x [0x%08x, 0x%08x]\n", 489 ee->tail, ee->rq_post, ee->rq_tail); 490 err_printf(m, " CTL: 0x%08x\n", ee->ctl); 491 err_printf(m, " MODE: 0x%08x\n", ee->mode); 492 err_printf(m, " HWS: 0x%08x\n", ee->hws); 493 err_printf(m, " ACTHD: 0x%08x %08x\n", 494 (u32)(ee->acthd>>32), (u32)ee->acthd); 495 err_printf(m, " IPEIR: 0x%08x\n", ee->ipeir); 496 err_printf(m, " IPEHR: 0x%08x\n", ee->ipehr); 497 498 error_print_instdone(m, ee); 499 500 if (ee->batchbuffer) { 501 u64 start = ee->batchbuffer->gtt_offset; 502 u64 end = start + ee->batchbuffer->gtt_size; 503 504 err_printf(m, " batch: [0x%08x_%08x, 0x%08x_%08x]\n", 505 upper_32_bits(start), lower_32_bits(start), 506 upper_32_bits(end), lower_32_bits(end)); 507 } 508 if (INTEL_GEN(m->i915) >= 4) { 509 err_printf(m, " BBADDR: 0x%08x_%08x\n", 510 (u32)(ee->bbaddr>>32), (u32)ee->bbaddr); 511 err_printf(m, " BB_STATE: 0x%08x\n", ee->bbstate); 512 err_printf(m, " INSTPS: 0x%08x\n", ee->instps); 513 } 514 err_printf(m, " INSTPM: 0x%08x\n", ee->instpm); 515 err_printf(m, " FADDR: 0x%08x %08x\n", upper_32_bits(ee->faddr), 516 lower_32_bits(ee->faddr)); 517 if (INTEL_GEN(m->i915) >= 6) { 518 err_printf(m, " RC PSMI: 0x%08x\n", ee->rc_psmi); 519 err_printf(m, " FAULT_REG: 0x%08x\n", ee->fault_reg); 520 } 521 if (HAS_PPGTT(m->i915)) { 522 err_printf(m, " GFX_MODE: 0x%08x\n", ee->vm_info.gfx_mode); 523 524 if (INTEL_GEN(m->i915) >= 8) { 525 int i; 526 for (i = 0; i < 4; i++) 527 err_printf(m, " PDP%d: 0x%016llx\n", 528 i, ee->vm_info.pdp[i]); 529 } else { 530 err_printf(m, " PP_DIR_BASE: 0x%08x\n", 531 ee->vm_info.pp_dir_base); 532 } 533 } 534 err_printf(m, " ring->head: 0x%08x\n", ee->cpu_ring_head); 535 err_printf(m, " ring->tail: 0x%08x\n", ee->cpu_ring_tail); 536 err_printf(m, " hangcheck timestamp: %dms (%lu%s)\n", 537 jiffies_to_msecs(ee->hangcheck_timestamp - epoch), 538 ee->hangcheck_timestamp, 539 ee->hangcheck_timestamp == epoch ? "; epoch" : ""); 540 err_printf(m, " engine reset count: %u\n", ee->reset_count); 541 542 for (n = 0; n < ee->num_ports; n++) { 543 err_printf(m, " ELSP[%d]:", n); 544 error_print_request(m, " ", &ee->execlist[n], epoch); 545 } 546 547 error_print_context(m, " Active context: ", &ee->context); 548 } 549 550 void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...) 551 { 552 va_list args; 553 554 va_start(args, f); 555 i915_error_vprintf(e, f, args); 556 va_end(args); 557 } 558 559 static void print_error_obj(struct drm_i915_error_state_buf *m, 560 const struct intel_engine_cs *engine, 561 const char *name, 562 const struct drm_i915_error_object *obj) 563 { 564 char out[ASCII85_BUFSZ]; 565 int page; 566 567 if (!obj) 568 return; 569 570 if (name) { 571 err_printf(m, "%s --- %s = 0x%08x %08x\n", 572 engine ? engine->name : "global", name, 573 upper_32_bits(obj->gtt_offset), 574 lower_32_bits(obj->gtt_offset)); 575 } 576 577 err_compression_marker(m); 578 for (page = 0; page < obj->page_count; page++) { 579 int i, len; 580 581 len = PAGE_SIZE; 582 if (page == obj->page_count - 1) 583 len -= obj->unused; 584 len = ascii85_encode_len(len); 585 586 for (i = 0; i < len; i++) 587 err_puts(m, ascii85_encode(obj->pages[page][i], out)); 588 } 589 err_puts(m, "\n"); 590 } 591 592 static void err_print_capabilities(struct drm_i915_error_state_buf *m, 593 const struct intel_device_info *info, 594 const struct intel_runtime_info *runtime, 595 const struct intel_driver_caps *caps) 596 { 597 struct drm_printer p = i915_error_printer(m); 598 599 intel_device_info_dump_flags(info, &p); 600 intel_driver_caps_print(caps, &p); 601 intel_device_info_dump_topology(&runtime->sseu, &p); 602 } 603 604 static void err_print_params(struct drm_i915_error_state_buf *m, 605 const struct i915_params *params) 606 { 607 struct drm_printer p = i915_error_printer(m); 608 609 i915_params_dump(params, &p); 610 } 611 612 static void err_print_pciid(struct drm_i915_error_state_buf *m, 613 struct drm_i915_private *i915) 614 { 615 struct pci_dev *pdev = i915->drm.pdev; 616 617 err_printf(m, "PCI ID: 0x%04x\n", pdev->device); 618 err_printf(m, "PCI Revision: 0x%02x\n", pdev->revision); 619 err_printf(m, "PCI Subsystem: %04x:%04x\n", 620 pdev->subsystem_vendor, 621 pdev->subsystem_device); 622 } 623 624 static void err_print_uc(struct drm_i915_error_state_buf *m, 625 const struct i915_error_uc *error_uc) 626 { 627 struct drm_printer p = i915_error_printer(m); 628 const struct i915_gpu_state *error = 629 container_of(error_uc, typeof(*error), uc); 630 631 if (!error->device_info.has_gt_uc) 632 return; 633 634 intel_uc_fw_dump(&error_uc->guc_fw, &p); 635 intel_uc_fw_dump(&error_uc->huc_fw, &p); 636 print_error_obj(m, NULL, "GuC log buffer", error_uc->guc_log); 637 } 638 639 static void err_free_sgl(struct scatterlist *sgl) 640 { 641 while (sgl) { 642 struct scatterlist *sg; 643 644 for (sg = sgl; !sg_is_chain(sg); sg++) { 645 kfree(sg_virt(sg)); 646 if (sg_is_last(sg)) 647 break; 648 } 649 650 sg = sg_is_last(sg) ? NULL : sg_chain_ptr(sg); 651 free_page((unsigned long)sgl); 652 sgl = sg; 653 } 654 } 655 656 static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, 657 struct i915_gpu_state *error) 658 { 659 const struct drm_i915_error_engine *ee; 660 struct timespec64 ts; 661 int i, j; 662 663 if (*error->error_msg) 664 err_printf(m, "%s\n", error->error_msg); 665 err_printf(m, "Kernel: %s %s\n", 666 init_utsname()->release, 667 init_utsname()->machine); 668 err_printf(m, "Driver: %s\n", DRIVER_DATE); 669 ts = ktime_to_timespec64(error->time); 670 err_printf(m, "Time: %lld s %ld us\n", 671 (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC); 672 ts = ktime_to_timespec64(error->boottime); 673 err_printf(m, "Boottime: %lld s %ld us\n", 674 (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC); 675 ts = ktime_to_timespec64(error->uptime); 676 err_printf(m, "Uptime: %lld s %ld us\n", 677 (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC); 678 err_printf(m, "Epoch: %lu jiffies (%u HZ)\n", error->epoch, HZ); 679 err_printf(m, "Capture: %lu jiffies; %d ms ago, %d ms after epoch\n", 680 error->capture, 681 jiffies_to_msecs(jiffies - error->capture), 682 jiffies_to_msecs(error->capture - error->epoch)); 683 684 for (ee = error->engine; ee; ee = ee->next) 685 err_printf(m, "Active process (on ring %s): %s [%d]\n", 686 ee->engine->name, 687 ee->context.comm, 688 ee->context.pid); 689 690 err_printf(m, "Reset count: %u\n", error->reset_count); 691 err_printf(m, "Suspend count: %u\n", error->suspend_count); 692 err_printf(m, "Platform: %s\n", intel_platform_name(error->device_info.platform)); 693 err_printf(m, "Subplatform: 0x%x\n", 694 intel_subplatform(&error->runtime_info, 695 error->device_info.platform)); 696 err_print_pciid(m, m->i915); 697 698 err_printf(m, "IOMMU enabled?: %d\n", error->iommu); 699 700 if (HAS_CSR(m->i915)) { 701 struct intel_csr *csr = &m->i915->csr; 702 703 err_printf(m, "DMC loaded: %s\n", 704 yesno(csr->dmc_payload != NULL)); 705 err_printf(m, "DMC fw version: %d.%d\n", 706 CSR_VERSION_MAJOR(csr->version), 707 CSR_VERSION_MINOR(csr->version)); 708 } 709 710 err_printf(m, "GT awake: %s\n", yesno(error->awake)); 711 err_printf(m, "RPM wakelock: %s\n", yesno(error->wakelock)); 712 err_printf(m, "PM suspended: %s\n", yesno(error->suspended)); 713 err_printf(m, "EIR: 0x%08x\n", error->eir); 714 err_printf(m, "IER: 0x%08x\n", error->ier); 715 for (i = 0; i < error->ngtier; i++) 716 err_printf(m, "GTIER[%d]: 0x%08x\n", i, error->gtier[i]); 717 err_printf(m, "PGTBL_ER: 0x%08x\n", error->pgtbl_er); 718 err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake); 719 err_printf(m, "DERRMR: 0x%08x\n", error->derrmr); 720 err_printf(m, "CCID: 0x%08x\n", error->ccid); 721 722 for (i = 0; i < error->nfence; i++) 723 err_printf(m, " fence[%d] = %08llx\n", i, error->fence[i]); 724 725 if (IS_GEN_RANGE(m->i915, 6, 11)) { 726 err_printf(m, "ERROR: 0x%08x\n", error->error); 727 err_printf(m, "DONE_REG: 0x%08x\n", error->done_reg); 728 } 729 730 if (INTEL_GEN(m->i915) >= 8) 731 err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n", 732 error->fault_data1, error->fault_data0); 733 734 if (IS_GEN(m->i915, 7)) 735 err_printf(m, "ERR_INT: 0x%08x\n", error->err_int); 736 737 for (ee = error->engine; ee; ee = ee->next) 738 error_print_engine(m, ee, error->epoch); 739 740 for (ee = error->engine; ee; ee = ee->next) { 741 const struct drm_i915_error_object *obj; 742 743 obj = ee->batchbuffer; 744 if (obj) { 745 err_puts(m, ee->engine->name); 746 if (ee->context.pid) 747 err_printf(m, " (submitted by %s [%d])", 748 ee->context.comm, 749 ee->context.pid); 750 err_printf(m, " --- gtt_offset = 0x%08x %08x\n", 751 upper_32_bits(obj->gtt_offset), 752 lower_32_bits(obj->gtt_offset)); 753 print_error_obj(m, ee->engine, NULL, obj); 754 } 755 756 for (j = 0; j < ee->user_bo_count; j++) 757 print_error_obj(m, ee->engine, "user", ee->user_bo[j]); 758 759 if (ee->num_requests) { 760 err_printf(m, "%s --- %d requests\n", 761 ee->engine->name, 762 ee->num_requests); 763 for (j = 0; j < ee->num_requests; j++) 764 error_print_request(m, " ", 765 &ee->requests[j], 766 error->epoch); 767 } 768 769 print_error_obj(m, ee->engine, "ringbuffer", ee->ringbuffer); 770 print_error_obj(m, ee->engine, "HW Status", ee->hws_page); 771 print_error_obj(m, ee->engine, "HW context", ee->ctx); 772 print_error_obj(m, ee->engine, "WA context", ee->wa_ctx); 773 print_error_obj(m, ee->engine, 774 "WA batchbuffer", ee->wa_batchbuffer); 775 print_error_obj(m, ee->engine, 776 "NULL context", ee->default_state); 777 } 778 779 if (error->overlay) 780 intel_overlay_print_error_state(m, error->overlay); 781 782 if (error->display) 783 intel_display_print_error_state(m, error->display); 784 785 err_print_capabilities(m, &error->device_info, &error->runtime_info, 786 &error->driver_caps); 787 err_print_params(m, &error->params); 788 err_print_uc(m, &error->uc); 789 } 790 791 static int err_print_to_sgl(struct i915_gpu_state *error) 792 { 793 struct drm_i915_error_state_buf m; 794 795 if (IS_ERR(error)) 796 return PTR_ERR(error); 797 798 if (READ_ONCE(error->sgl)) 799 return 0; 800 801 memset(&m, 0, sizeof(m)); 802 m.i915 = error->i915; 803 804 __err_print_to_sgl(&m, error); 805 806 if (m.buf) { 807 __sg_set_buf(m.cur++, m.buf, m.bytes, m.iter); 808 m.bytes = 0; 809 m.buf = NULL; 810 } 811 if (m.cur) { 812 GEM_BUG_ON(m.end < m.cur); 813 sg_mark_end(m.cur - 1); 814 } 815 GEM_BUG_ON(m.sgl && !m.cur); 816 817 if (m.err) { 818 err_free_sgl(m.sgl); 819 return m.err; 820 } 821 822 if (cmpxchg(&error->sgl, NULL, m.sgl)) 823 err_free_sgl(m.sgl); 824 825 return 0; 826 } 827 828 ssize_t i915_gpu_state_copy_to_buffer(struct i915_gpu_state *error, 829 char *buf, loff_t off, size_t rem) 830 { 831 struct scatterlist *sg; 832 size_t count; 833 loff_t pos; 834 int err; 835 836 if (!error || !rem) 837 return 0; 838 839 err = err_print_to_sgl(error); 840 if (err) 841 return err; 842 843 sg = READ_ONCE(error->fit); 844 if (!sg || off < sg->dma_address) 845 sg = error->sgl; 846 if (!sg) 847 return 0; 848 849 pos = sg->dma_address; 850 count = 0; 851 do { 852 size_t len, start; 853 854 if (sg_is_chain(sg)) { 855 sg = sg_chain_ptr(sg); 856 GEM_BUG_ON(sg_is_chain(sg)); 857 } 858 859 len = sg->length; 860 if (pos + len <= off) { 861 pos += len; 862 continue; 863 } 864 865 start = sg->offset; 866 if (pos < off) { 867 GEM_BUG_ON(off - pos > len); 868 len -= off - pos; 869 start += off - pos; 870 pos = off; 871 } 872 873 len = min(len, rem); 874 GEM_BUG_ON(!len || len > sg->length); 875 876 memcpy(buf, page_address(sg_page(sg)) + start, len); 877 878 count += len; 879 pos += len; 880 881 buf += len; 882 rem -= len; 883 if (!rem) { 884 WRITE_ONCE(error->fit, sg); 885 break; 886 } 887 } while (!sg_is_last(sg++)); 888 889 return count; 890 } 891 892 static void i915_error_object_free(struct drm_i915_error_object *obj) 893 { 894 int page; 895 896 if (obj == NULL) 897 return; 898 899 for (page = 0; page < obj->page_count; page++) 900 free_page((unsigned long)obj->pages[page]); 901 902 kfree(obj); 903 } 904 905 906 static void cleanup_params(struct i915_gpu_state *error) 907 { 908 i915_params_free(&error->params); 909 } 910 911 static void cleanup_uc_state(struct i915_gpu_state *error) 912 { 913 struct i915_error_uc *error_uc = &error->uc; 914 915 kfree(error_uc->guc_fw.path); 916 kfree(error_uc->huc_fw.path); 917 i915_error_object_free(error_uc->guc_log); 918 } 919 920 void __i915_gpu_state_free(struct kref *error_ref) 921 { 922 struct i915_gpu_state *error = 923 container_of(error_ref, typeof(*error), ref); 924 long i; 925 926 while (error->engine) { 927 struct drm_i915_error_engine *ee = error->engine; 928 929 error->engine = ee->next; 930 931 for (i = 0; i < ee->user_bo_count; i++) 932 i915_error_object_free(ee->user_bo[i]); 933 kfree(ee->user_bo); 934 935 i915_error_object_free(ee->batchbuffer); 936 i915_error_object_free(ee->wa_batchbuffer); 937 i915_error_object_free(ee->ringbuffer); 938 i915_error_object_free(ee->hws_page); 939 i915_error_object_free(ee->ctx); 940 i915_error_object_free(ee->wa_ctx); 941 942 kfree(ee->requests); 943 kfree(ee); 944 } 945 946 kfree(error->overlay); 947 kfree(error->display); 948 949 cleanup_params(error); 950 cleanup_uc_state(error); 951 952 err_free_sgl(error->sgl); 953 kfree(error); 954 } 955 956 static struct drm_i915_error_object * 957 i915_error_object_create(struct drm_i915_private *i915, 958 struct i915_vma *vma, 959 struct compress *compress) 960 { 961 struct i915_ggtt *ggtt = &i915->ggtt; 962 const u64 slot = ggtt->error_capture.start; 963 struct drm_i915_error_object *dst; 964 unsigned long num_pages; 965 struct sgt_iter iter; 966 dma_addr_t dma; 967 int ret; 968 969 might_sleep(); 970 971 if (!vma || !vma->pages) 972 return NULL; 973 974 num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT; 975 num_pages = DIV_ROUND_UP(10 * num_pages, 8); /* worstcase zlib growth */ 976 dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *), ALLOW_FAIL); 977 if (!dst) 978 return NULL; 979 980 if (!compress_start(compress)) { 981 kfree(dst); 982 return NULL; 983 } 984 985 dst->gtt_offset = vma->node.start; 986 dst->gtt_size = vma->node.size; 987 dst->num_pages = num_pages; 988 dst->page_count = 0; 989 dst->unused = 0; 990 991 ret = -EINVAL; 992 for_each_sgt_dma(dma, iter, vma->pages) { 993 void __iomem *s; 994 995 ggtt->vm.insert_page(&ggtt->vm, dma, slot, I915_CACHE_NONE, 0); 996 997 s = io_mapping_map_wc(&ggtt->iomap, slot, PAGE_SIZE); 998 ret = compress_page(compress, (void __force *)s, dst); 999 io_mapping_unmap(s); 1000 if (ret) 1001 break; 1002 } 1003 1004 if (ret || compress_flush(compress, dst)) { 1005 while (dst->page_count--) 1006 pool_free(&compress->pool, dst->pages[dst->page_count]); 1007 kfree(dst); 1008 dst = NULL; 1009 } 1010 compress_finish(compress); 1011 1012 return dst; 1013 } 1014 1015 /* 1016 * Generate a semi-unique error code. The code is not meant to have meaning, The 1017 * code's only purpose is to try to prevent false duplicated bug reports by 1018 * grossly estimating a GPU error state. 1019 * 1020 * TODO Ideally, hashing the batchbuffer would be a very nice way to determine 1021 * the hang if we could strip the GTT offset information from it. 1022 * 1023 * It's only a small step better than a random number in its current form. 1024 */ 1025 static u32 i915_error_generate_code(struct i915_gpu_state *error) 1026 { 1027 const struct drm_i915_error_engine *ee = error->engine; 1028 1029 /* 1030 * IPEHR would be an ideal way to detect errors, as it's the gross 1031 * measure of "the command that hung." However, has some very common 1032 * synchronization commands which almost always appear in the case 1033 * strictly a client bug. Use instdone to differentiate those some. 1034 */ 1035 return ee ? ee->ipehr ^ ee->instdone.instdone : 0; 1036 } 1037 1038 static void gem_record_fences(struct i915_gpu_state *error) 1039 { 1040 struct drm_i915_private *dev_priv = error->i915; 1041 struct intel_uncore *uncore = &dev_priv->uncore; 1042 int i; 1043 1044 if (INTEL_GEN(dev_priv) >= 6) { 1045 for (i = 0; i < dev_priv->ggtt.num_fences; i++) 1046 error->fence[i] = 1047 intel_uncore_read64(uncore, 1048 FENCE_REG_GEN6_LO(i)); 1049 } else if (INTEL_GEN(dev_priv) >= 4) { 1050 for (i = 0; i < dev_priv->ggtt.num_fences; i++) 1051 error->fence[i] = 1052 intel_uncore_read64(uncore, 1053 FENCE_REG_965_LO(i)); 1054 } else { 1055 for (i = 0; i < dev_priv->ggtt.num_fences; i++) 1056 error->fence[i] = 1057 intel_uncore_read(uncore, FENCE_REG(i)); 1058 } 1059 error->nfence = i; 1060 } 1061 1062 static void error_record_engine_registers(struct i915_gpu_state *error, 1063 struct intel_engine_cs *engine, 1064 struct drm_i915_error_engine *ee) 1065 { 1066 struct drm_i915_private *dev_priv = engine->i915; 1067 1068 if (INTEL_GEN(dev_priv) >= 6) { 1069 ee->rc_psmi = ENGINE_READ(engine, RING_PSMI_CTL); 1070 1071 if (INTEL_GEN(dev_priv) >= 12) 1072 ee->fault_reg = I915_READ(GEN12_RING_FAULT_REG); 1073 else if (INTEL_GEN(dev_priv) >= 8) 1074 ee->fault_reg = I915_READ(GEN8_RING_FAULT_REG); 1075 else 1076 ee->fault_reg = GEN6_RING_FAULT_REG_READ(engine); 1077 } 1078 1079 if (INTEL_GEN(dev_priv) >= 4) { 1080 ee->faddr = ENGINE_READ(engine, RING_DMA_FADD); 1081 ee->ipeir = ENGINE_READ(engine, RING_IPEIR); 1082 ee->ipehr = ENGINE_READ(engine, RING_IPEHR); 1083 ee->instps = ENGINE_READ(engine, RING_INSTPS); 1084 ee->bbaddr = ENGINE_READ(engine, RING_BBADDR); 1085 if (INTEL_GEN(dev_priv) >= 8) { 1086 ee->faddr |= (u64)ENGINE_READ(engine, RING_DMA_FADD_UDW) << 32; 1087 ee->bbaddr |= (u64)ENGINE_READ(engine, RING_BBADDR_UDW) << 32; 1088 } 1089 ee->bbstate = ENGINE_READ(engine, RING_BBSTATE); 1090 } else { 1091 ee->faddr = ENGINE_READ(engine, DMA_FADD_I8XX); 1092 ee->ipeir = ENGINE_READ(engine, IPEIR); 1093 ee->ipehr = ENGINE_READ(engine, IPEHR); 1094 } 1095 1096 intel_engine_get_instdone(engine, &ee->instdone); 1097 1098 ee->instpm = ENGINE_READ(engine, RING_INSTPM); 1099 ee->acthd = intel_engine_get_active_head(engine); 1100 ee->start = ENGINE_READ(engine, RING_START); 1101 ee->head = ENGINE_READ(engine, RING_HEAD); 1102 ee->tail = ENGINE_READ(engine, RING_TAIL); 1103 ee->ctl = ENGINE_READ(engine, RING_CTL); 1104 if (INTEL_GEN(dev_priv) > 2) 1105 ee->mode = ENGINE_READ(engine, RING_MI_MODE); 1106 1107 if (!HWS_NEEDS_PHYSICAL(dev_priv)) { 1108 i915_reg_t mmio; 1109 1110 if (IS_GEN(dev_priv, 7)) { 1111 switch (engine->id) { 1112 default: 1113 MISSING_CASE(engine->id); 1114 /* fall through */ 1115 case RCS0: 1116 mmio = RENDER_HWS_PGA_GEN7; 1117 break; 1118 case BCS0: 1119 mmio = BLT_HWS_PGA_GEN7; 1120 break; 1121 case VCS0: 1122 mmio = BSD_HWS_PGA_GEN7; 1123 break; 1124 case VECS0: 1125 mmio = VEBOX_HWS_PGA_GEN7; 1126 break; 1127 } 1128 } else if (IS_GEN(engine->i915, 6)) { 1129 mmio = RING_HWS_PGA_GEN6(engine->mmio_base); 1130 } else { 1131 /* XXX: gen8 returns to sanity */ 1132 mmio = RING_HWS_PGA(engine->mmio_base); 1133 } 1134 1135 ee->hws = I915_READ(mmio); 1136 } 1137 1138 ee->idle = intel_engine_is_idle(engine); 1139 if (!ee->idle) 1140 ee->hangcheck_timestamp = engine->hangcheck.action_timestamp; 1141 ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error, 1142 engine); 1143 1144 if (HAS_PPGTT(dev_priv)) { 1145 int i; 1146 1147 ee->vm_info.gfx_mode = ENGINE_READ(engine, RING_MODE_GEN7); 1148 1149 if (IS_GEN(dev_priv, 6)) { 1150 ee->vm_info.pp_dir_base = 1151 ENGINE_READ(engine, RING_PP_DIR_BASE_READ); 1152 } else if (IS_GEN(dev_priv, 7)) { 1153 ee->vm_info.pp_dir_base = 1154 ENGINE_READ(engine, RING_PP_DIR_BASE); 1155 } else if (INTEL_GEN(dev_priv) >= 8) { 1156 u32 base = engine->mmio_base; 1157 1158 for (i = 0; i < 4; i++) { 1159 ee->vm_info.pdp[i] = 1160 I915_READ(GEN8_RING_PDP_UDW(base, i)); 1161 ee->vm_info.pdp[i] <<= 32; 1162 ee->vm_info.pdp[i] |= 1163 I915_READ(GEN8_RING_PDP_LDW(base, i)); 1164 } 1165 } 1166 } 1167 } 1168 1169 static void record_request(const struct i915_request *request, 1170 struct drm_i915_error_request *erq) 1171 { 1172 const struct i915_gem_context *ctx = request->gem_context; 1173 1174 erq->flags = request->fence.flags; 1175 erq->context = request->fence.context; 1176 erq->seqno = request->fence.seqno; 1177 erq->sched_attr = request->sched.attr; 1178 erq->jiffies = request->emitted_jiffies; 1179 erq->start = i915_ggtt_offset(request->ring->vma); 1180 erq->head = request->head; 1181 erq->tail = request->tail; 1182 1183 rcu_read_lock(); 1184 erq->pid = ctx->pid ? pid_nr(ctx->pid) : 0; 1185 rcu_read_unlock(); 1186 } 1187 1188 static void engine_record_requests(struct intel_engine_cs *engine, 1189 struct i915_request *first, 1190 struct drm_i915_error_engine *ee) 1191 { 1192 struct i915_request *request; 1193 int count; 1194 1195 count = 0; 1196 request = first; 1197 list_for_each_entry_from(request, &engine->active.requests, sched.link) 1198 count++; 1199 if (!count) 1200 return; 1201 1202 ee->requests = kcalloc(count, sizeof(*ee->requests), ATOMIC_MAYFAIL); 1203 if (!ee->requests) 1204 return; 1205 1206 ee->num_requests = count; 1207 1208 count = 0; 1209 request = first; 1210 list_for_each_entry_from(request, 1211 &engine->active.requests, sched.link) { 1212 if (count >= ee->num_requests) { 1213 /* 1214 * If the ring request list was changed in 1215 * between the point where the error request 1216 * list was created and dimensioned and this 1217 * point then just exit early to avoid crashes. 1218 * 1219 * We don't need to communicate that the 1220 * request list changed state during error 1221 * state capture and that the error state is 1222 * slightly incorrect as a consequence since we 1223 * are typically only interested in the request 1224 * list state at the point of error state 1225 * capture, not in any changes happening during 1226 * the capture. 1227 */ 1228 break; 1229 } 1230 1231 record_request(request, &ee->requests[count++]); 1232 } 1233 ee->num_requests = count; 1234 } 1235 1236 static void error_record_engine_execlists(const struct intel_engine_cs *engine, 1237 struct drm_i915_error_engine *ee) 1238 { 1239 const struct intel_engine_execlists * const execlists = &engine->execlists; 1240 struct i915_request * const *port = execlists->active; 1241 unsigned int n = 0; 1242 1243 while (*port) 1244 record_request(*port++, &ee->execlist[n++]); 1245 1246 ee->num_ports = n; 1247 } 1248 1249 static bool record_context(struct drm_i915_error_context *e, 1250 const struct i915_request *rq) 1251 { 1252 const struct i915_gem_context *ctx = rq->gem_context; 1253 1254 if (ctx->pid) { 1255 struct task_struct *task; 1256 1257 rcu_read_lock(); 1258 task = pid_task(ctx->pid, PIDTYPE_PID); 1259 if (task) { 1260 strcpy(e->comm, task->comm); 1261 e->pid = task->pid; 1262 } 1263 rcu_read_unlock(); 1264 } 1265 1266 e->hw_id = ctx->hw_id; 1267 e->sched_attr = ctx->sched; 1268 e->guilty = atomic_read(&ctx->guilty_count); 1269 e->active = atomic_read(&ctx->active_count); 1270 1271 return i915_gem_context_no_error_capture(ctx); 1272 } 1273 1274 struct capture_vma { 1275 struct capture_vma *next; 1276 void **slot; 1277 }; 1278 1279 static struct capture_vma * 1280 capture_vma(struct capture_vma *next, 1281 struct i915_vma *vma, 1282 struct drm_i915_error_object **out) 1283 { 1284 struct capture_vma *c; 1285 1286 *out = NULL; 1287 if (!vma) 1288 return next; 1289 1290 c = kmalloc(sizeof(*c), ATOMIC_MAYFAIL); 1291 if (!c) 1292 return next; 1293 1294 if (!i915_active_trygrab(&vma->active)) { 1295 kfree(c); 1296 return next; 1297 } 1298 1299 c->slot = (void **)out; 1300 *c->slot = i915_vma_get(vma); 1301 1302 c->next = next; 1303 return c; 1304 } 1305 1306 static struct capture_vma * 1307 request_record_user_bo(struct i915_request *request, 1308 struct drm_i915_error_engine *ee, 1309 struct capture_vma *capture) 1310 { 1311 struct i915_capture_list *c; 1312 struct drm_i915_error_object **bo; 1313 long count, max; 1314 1315 max = 0; 1316 for (c = request->capture_list; c; c = c->next) 1317 max++; 1318 if (!max) 1319 return capture; 1320 1321 bo = kmalloc_array(max, sizeof(*bo), ATOMIC_MAYFAIL); 1322 if (!bo) { 1323 /* If we can't capture everything, try to capture something. */ 1324 max = min_t(long, max, PAGE_SIZE / sizeof(*bo)); 1325 bo = kmalloc_array(max, sizeof(*bo), ATOMIC_MAYFAIL); 1326 } 1327 if (!bo) 1328 return capture; 1329 1330 count = 0; 1331 for (c = request->capture_list; c; c = c->next) { 1332 capture = capture_vma(capture, c->vma, &bo[count]); 1333 if (++count == max) 1334 break; 1335 } 1336 1337 ee->user_bo = bo; 1338 ee->user_bo_count = count; 1339 1340 return capture; 1341 } 1342 1343 static struct drm_i915_error_object * 1344 capture_object(struct drm_i915_private *dev_priv, 1345 struct drm_i915_gem_object *obj, 1346 struct compress *compress) 1347 { 1348 if (obj && i915_gem_object_has_pages(obj)) { 1349 struct i915_vma fake = { 1350 .node = { .start = U64_MAX, .size = obj->base.size }, 1351 .size = obj->base.size, 1352 .pages = obj->mm.pages, 1353 .obj = obj, 1354 }; 1355 1356 return i915_error_object_create(dev_priv, &fake, compress); 1357 } else { 1358 return NULL; 1359 } 1360 } 1361 1362 static void 1363 gem_record_rings(struct i915_gpu_state *error, struct compress *compress) 1364 { 1365 struct drm_i915_private *i915 = error->i915; 1366 struct intel_engine_cs *engine; 1367 struct drm_i915_error_engine *ee; 1368 1369 ee = kzalloc(sizeof(*ee), GFP_KERNEL); 1370 if (!ee) 1371 return; 1372 1373 for_each_uabi_engine(engine, i915) { 1374 struct capture_vma *capture = NULL; 1375 struct i915_request *request; 1376 unsigned long flags; 1377 1378 /* Refill our page pool before entering atomic section */ 1379 pool_refill(&compress->pool, ALLOW_FAIL); 1380 1381 spin_lock_irqsave(&engine->active.lock, flags); 1382 request = intel_engine_find_active_request(engine); 1383 if (!request) { 1384 spin_unlock_irqrestore(&engine->active.lock, flags); 1385 continue; 1386 } 1387 1388 error->simulated |= record_context(&ee->context, request); 1389 1390 /* 1391 * We need to copy these to an anonymous buffer 1392 * as the simplest method to avoid being overwritten 1393 * by userspace. 1394 */ 1395 capture = capture_vma(capture, 1396 request->batch, 1397 &ee->batchbuffer); 1398 1399 if (HAS_BROKEN_CS_TLB(i915)) 1400 capture = capture_vma(capture, 1401 engine->gt->scratch, 1402 &ee->wa_batchbuffer); 1403 1404 capture = request_record_user_bo(request, ee, capture); 1405 1406 capture = capture_vma(capture, 1407 request->hw_context->state, 1408 &ee->ctx); 1409 1410 capture = capture_vma(capture, 1411 request->ring->vma, 1412 &ee->ringbuffer); 1413 1414 ee->cpu_ring_head = request->ring->head; 1415 ee->cpu_ring_tail = request->ring->tail; 1416 1417 ee->rq_head = request->head; 1418 ee->rq_post = request->postfix; 1419 ee->rq_tail = request->tail; 1420 1421 engine_record_requests(engine, request, ee); 1422 spin_unlock_irqrestore(&engine->active.lock, flags); 1423 1424 error_record_engine_registers(error, engine, ee); 1425 error_record_engine_execlists(engine, ee); 1426 1427 while (capture) { 1428 struct capture_vma *this = capture; 1429 struct i915_vma *vma = *this->slot; 1430 1431 *this->slot = 1432 i915_error_object_create(i915, vma, compress); 1433 1434 i915_active_ungrab(&vma->active); 1435 i915_vma_put(vma); 1436 1437 capture = this->next; 1438 kfree(this); 1439 } 1440 1441 ee->hws_page = 1442 i915_error_object_create(i915, 1443 engine->status_page.vma, 1444 compress); 1445 1446 ee->wa_ctx = 1447 i915_error_object_create(i915, 1448 engine->wa_ctx.vma, 1449 compress); 1450 1451 ee->default_state = 1452 capture_object(i915, engine->default_state, compress); 1453 1454 ee->engine = engine; 1455 1456 ee->next = error->engine; 1457 error->engine = ee; 1458 1459 ee = kzalloc(sizeof(*ee), GFP_KERNEL); 1460 if (!ee) 1461 return; 1462 } 1463 1464 kfree(ee); 1465 } 1466 1467 static void 1468 capture_uc_state(struct i915_gpu_state *error, struct compress *compress) 1469 { 1470 struct drm_i915_private *i915 = error->i915; 1471 struct i915_error_uc *error_uc = &error->uc; 1472 struct intel_uc *uc = &i915->gt.uc; 1473 1474 /* Capturing uC state won't be useful if there is no GuC */ 1475 if (!error->device_info.has_gt_uc) 1476 return; 1477 1478 memcpy(&error_uc->guc_fw, &uc->guc.fw, sizeof(uc->guc.fw)); 1479 memcpy(&error_uc->huc_fw, &uc->huc.fw, sizeof(uc->huc.fw)); 1480 1481 /* Non-default firmware paths will be specified by the modparam. 1482 * As modparams are generally accesible from the userspace make 1483 * explicit copies of the firmware paths. 1484 */ 1485 error_uc->guc_fw.path = kstrdup(uc->guc.fw.path, ALLOW_FAIL); 1486 error_uc->huc_fw.path = kstrdup(uc->huc.fw.path, ALLOW_FAIL); 1487 error_uc->guc_log = i915_error_object_create(i915, 1488 uc->guc.log.vma, 1489 compress); 1490 } 1491 1492 /* Capture all registers which don't fit into another category. */ 1493 static void capture_reg_state(struct i915_gpu_state *error) 1494 { 1495 struct drm_i915_private *i915 = error->i915; 1496 struct intel_uncore *uncore = &i915->uncore; 1497 int i; 1498 1499 /* General organization 1500 * 1. Registers specific to a single generation 1501 * 2. Registers which belong to multiple generations 1502 * 3. Feature specific registers. 1503 * 4. Everything else 1504 * Please try to follow the order. 1505 */ 1506 1507 /* 1: Registers specific to a single generation */ 1508 if (IS_VALLEYVIEW(i915)) { 1509 error->gtier[0] = intel_uncore_read(uncore, GTIER); 1510 error->ier = intel_uncore_read(uncore, VLV_IER); 1511 error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_VLV); 1512 } 1513 1514 if (IS_GEN(i915, 7)) 1515 error->err_int = intel_uncore_read(uncore, GEN7_ERR_INT); 1516 1517 if (INTEL_GEN(i915) >= 12) { 1518 error->fault_data0 = intel_uncore_read(uncore, 1519 GEN12_FAULT_TLB_DATA0); 1520 error->fault_data1 = intel_uncore_read(uncore, 1521 GEN12_FAULT_TLB_DATA1); 1522 } else if (INTEL_GEN(i915) >= 8) { 1523 error->fault_data0 = intel_uncore_read(uncore, 1524 GEN8_FAULT_TLB_DATA0); 1525 error->fault_data1 = intel_uncore_read(uncore, 1526 GEN8_FAULT_TLB_DATA1); 1527 } 1528 1529 if (IS_GEN(i915, 6)) { 1530 error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE); 1531 error->gab_ctl = intel_uncore_read(uncore, GAB_CTL); 1532 error->gfx_mode = intel_uncore_read(uncore, GFX_MODE); 1533 } 1534 1535 /* 2: Registers which belong to multiple generations */ 1536 if (INTEL_GEN(i915) >= 7) 1537 error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_MT); 1538 1539 if (INTEL_GEN(i915) >= 6) { 1540 error->derrmr = intel_uncore_read(uncore, DERRMR); 1541 if (INTEL_GEN(i915) < 12) { 1542 error->error = intel_uncore_read(uncore, ERROR_GEN6); 1543 error->done_reg = intel_uncore_read(uncore, DONE_REG); 1544 } 1545 } 1546 1547 if (INTEL_GEN(i915) >= 5) 1548 error->ccid = intel_uncore_read(uncore, CCID(RENDER_RING_BASE)); 1549 1550 /* 3: Feature specific registers */ 1551 if (IS_GEN_RANGE(i915, 6, 7)) { 1552 error->gam_ecochk = intel_uncore_read(uncore, GAM_ECOCHK); 1553 error->gac_eco = intel_uncore_read(uncore, GAC_ECO_BITS); 1554 } 1555 1556 /* 4: Everything else */ 1557 if (INTEL_GEN(i915) >= 11) { 1558 error->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER); 1559 error->gtier[0] = 1560 intel_uncore_read(uncore, 1561 GEN11_RENDER_COPY_INTR_ENABLE); 1562 error->gtier[1] = 1563 intel_uncore_read(uncore, GEN11_VCS_VECS_INTR_ENABLE); 1564 error->gtier[2] = 1565 intel_uncore_read(uncore, GEN11_GUC_SG_INTR_ENABLE); 1566 error->gtier[3] = 1567 intel_uncore_read(uncore, 1568 GEN11_GPM_WGBOXPERF_INTR_ENABLE); 1569 error->gtier[4] = 1570 intel_uncore_read(uncore, 1571 GEN11_CRYPTO_RSVD_INTR_ENABLE); 1572 error->gtier[5] = 1573 intel_uncore_read(uncore, 1574 GEN11_GUNIT_CSME_INTR_ENABLE); 1575 error->ngtier = 6; 1576 } else if (INTEL_GEN(i915) >= 8) { 1577 error->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER); 1578 for (i = 0; i < 4; i++) 1579 error->gtier[i] = intel_uncore_read(uncore, 1580 GEN8_GT_IER(i)); 1581 error->ngtier = 4; 1582 } else if (HAS_PCH_SPLIT(i915)) { 1583 error->ier = intel_uncore_read(uncore, DEIER); 1584 error->gtier[0] = intel_uncore_read(uncore, GTIER); 1585 error->ngtier = 1; 1586 } else if (IS_GEN(i915, 2)) { 1587 error->ier = intel_uncore_read16(uncore, GEN2_IER); 1588 } else if (!IS_VALLEYVIEW(i915)) { 1589 error->ier = intel_uncore_read(uncore, GEN2_IER); 1590 } 1591 error->eir = intel_uncore_read(uncore, EIR); 1592 error->pgtbl_er = intel_uncore_read(uncore, PGTBL_ER); 1593 } 1594 1595 static const char * 1596 error_msg(struct i915_gpu_state *error, 1597 intel_engine_mask_t engines, const char *msg) 1598 { 1599 int len; 1600 1601 len = scnprintf(error->error_msg, sizeof(error->error_msg), 1602 "GPU HANG: ecode %d:%x:0x%08x", 1603 INTEL_GEN(error->i915), engines, 1604 i915_error_generate_code(error)); 1605 if (error->engine) { 1606 /* Just show the first executing process, more is confusing */ 1607 len += scnprintf(error->error_msg + len, 1608 sizeof(error->error_msg) - len, 1609 ", in %s [%d]", 1610 error->engine->context.comm, 1611 error->engine->context.pid); 1612 } 1613 if (msg) 1614 len += scnprintf(error->error_msg + len, 1615 sizeof(error->error_msg) - len, 1616 ", %s", msg); 1617 1618 return error->error_msg; 1619 } 1620 1621 static void capture_gen_state(struct i915_gpu_state *error) 1622 { 1623 struct drm_i915_private *i915 = error->i915; 1624 1625 error->awake = i915->gt.awake; 1626 error->wakelock = atomic_read(&i915->runtime_pm.wakeref_count); 1627 error->suspended = i915->runtime_pm.suspended; 1628 1629 error->iommu = -1; 1630 #ifdef CONFIG_INTEL_IOMMU 1631 error->iommu = intel_iommu_gfx_mapped; 1632 #endif 1633 error->reset_count = i915_reset_count(&i915->gpu_error); 1634 error->suspend_count = i915->suspend_count; 1635 1636 memcpy(&error->device_info, 1637 INTEL_INFO(i915), 1638 sizeof(error->device_info)); 1639 memcpy(&error->runtime_info, 1640 RUNTIME_INFO(i915), 1641 sizeof(error->runtime_info)); 1642 error->driver_caps = i915->caps; 1643 } 1644 1645 static void capture_params(struct i915_gpu_state *error) 1646 { 1647 i915_params_copy(&error->params, &i915_modparams); 1648 } 1649 1650 static unsigned long capture_find_epoch(const struct i915_gpu_state *error) 1651 { 1652 const struct drm_i915_error_engine *ee; 1653 unsigned long epoch = error->capture; 1654 1655 for (ee = error->engine; ee; ee = ee->next) { 1656 if (ee->hangcheck_timestamp && 1657 time_before(ee->hangcheck_timestamp, epoch)) 1658 epoch = ee->hangcheck_timestamp; 1659 } 1660 1661 return epoch; 1662 } 1663 1664 static void capture_finish(struct i915_gpu_state *error) 1665 { 1666 struct i915_ggtt *ggtt = &error->i915->ggtt; 1667 const u64 slot = ggtt->error_capture.start; 1668 1669 ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE); 1670 } 1671 1672 #define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x)) 1673 1674 struct i915_gpu_state * 1675 i915_capture_gpu_state(struct drm_i915_private *i915) 1676 { 1677 struct i915_gpu_state *error; 1678 struct compress compress; 1679 1680 /* Check if GPU capture has been disabled */ 1681 error = READ_ONCE(i915->gpu_error.first_error); 1682 if (IS_ERR(error)) 1683 return error; 1684 1685 error = kzalloc(sizeof(*error), ALLOW_FAIL); 1686 if (!error) { 1687 i915_disable_error_state(i915, -ENOMEM); 1688 return ERR_PTR(-ENOMEM); 1689 } 1690 1691 if (!compress_init(&compress)) { 1692 kfree(error); 1693 i915_disable_error_state(i915, -ENOMEM); 1694 return ERR_PTR(-ENOMEM); 1695 } 1696 1697 kref_init(&error->ref); 1698 error->i915 = i915; 1699 1700 error->time = ktime_get_real(); 1701 error->boottime = ktime_get_boottime(); 1702 error->uptime = ktime_sub(ktime_get(), i915->gt.last_init_time); 1703 error->capture = jiffies; 1704 1705 capture_params(error); 1706 capture_gen_state(error); 1707 capture_uc_state(error, &compress); 1708 capture_reg_state(error); 1709 gem_record_fences(error); 1710 gem_record_rings(error, &compress); 1711 1712 error->overlay = intel_overlay_capture_error_state(i915); 1713 error->display = intel_display_capture_error_state(i915); 1714 1715 error->epoch = capture_find_epoch(error); 1716 1717 capture_finish(error); 1718 compress_fini(&compress); 1719 1720 return error; 1721 } 1722 1723 /** 1724 * i915_capture_error_state - capture an error record for later analysis 1725 * @i915: i915 device 1726 * @engine_mask: the mask of engines triggering the hang 1727 * @msg: a message to insert into the error capture header 1728 * 1729 * Should be called when an error is detected (either a hang or an error 1730 * interrupt) to capture error state from the time of the error. Fills 1731 * out a structure which becomes available in debugfs for user level tools 1732 * to pick up. 1733 */ 1734 void i915_capture_error_state(struct drm_i915_private *i915, 1735 intel_engine_mask_t engine_mask, 1736 const char *msg) 1737 { 1738 static bool warned; 1739 struct i915_gpu_state *error; 1740 unsigned long flags; 1741 1742 if (!i915_modparams.error_capture) 1743 return; 1744 1745 if (READ_ONCE(i915->gpu_error.first_error)) 1746 return; 1747 1748 error = i915_capture_gpu_state(i915); 1749 if (IS_ERR(error)) 1750 return; 1751 1752 dev_info(i915->drm.dev, "%s\n", error_msg(error, engine_mask, msg)); 1753 1754 if (!error->simulated) { 1755 spin_lock_irqsave(&i915->gpu_error.lock, flags); 1756 if (!i915->gpu_error.first_error) { 1757 i915->gpu_error.first_error = error; 1758 error = NULL; 1759 } 1760 spin_unlock_irqrestore(&i915->gpu_error.lock, flags); 1761 } 1762 1763 if (error) { 1764 __i915_gpu_state_free(&error->ref); 1765 return; 1766 } 1767 1768 if (!xchg(&warned, true) && 1769 ktime_get_real_seconds() - DRIVER_TIMESTAMP < DAY_AS_SECONDS(180)) { 1770 pr_info("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n"); 1771 pr_info("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n"); 1772 pr_info("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n"); 1773 pr_info("The GPU crash dump is required to analyze GPU hangs, so please always attach it.\n"); 1774 pr_info("GPU crash dump saved to /sys/class/drm/card%d/error\n", 1775 i915->drm.primary->index); 1776 } 1777 } 1778 1779 struct i915_gpu_state * 1780 i915_first_error_state(struct drm_i915_private *i915) 1781 { 1782 struct i915_gpu_state *error; 1783 1784 spin_lock_irq(&i915->gpu_error.lock); 1785 error = i915->gpu_error.first_error; 1786 if (!IS_ERR_OR_NULL(error)) 1787 i915_gpu_state_get(error); 1788 spin_unlock_irq(&i915->gpu_error.lock); 1789 1790 return error; 1791 } 1792 1793 void i915_reset_error_state(struct drm_i915_private *i915) 1794 { 1795 struct i915_gpu_state *error; 1796 1797 spin_lock_irq(&i915->gpu_error.lock); 1798 error = i915->gpu_error.first_error; 1799 if (error != ERR_PTR(-ENODEV)) /* if disabled, always disabled */ 1800 i915->gpu_error.first_error = NULL; 1801 spin_unlock_irq(&i915->gpu_error.lock); 1802 1803 if (!IS_ERR_OR_NULL(error)) 1804 i915_gpu_state_put(error); 1805 } 1806 1807 void i915_disable_error_state(struct drm_i915_private *i915, int err) 1808 { 1809 spin_lock_irq(&i915->gpu_error.lock); 1810 if (!i915->gpu_error.first_error) 1811 i915->gpu_error.first_error = ERR_PTR(err); 1812 spin_unlock_irq(&i915->gpu_error.lock); 1813 } 1814