1 /* 2 * Copyright (c) 2008 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * Authors: 24 * Eric Anholt <eric@anholt.net> 25 * Keith Packard <keithp@keithp.com> 26 * Mika Kuoppala <mika.kuoppala@intel.com> 27 * 28 */ 29 30 #include <linux/ascii85.h> 31 #include <linux/nmi.h> 32 #include <linux/pagevec.h> 33 #include <linux/scatterlist.h> 34 #include <linux/utsname.h> 35 #include <linux/zlib.h> 36 37 #include <drm/drm_print.h> 38 39 #include "display/intel_atomic.h" 40 #include "display/intel_overlay.h" 41 42 #include "gem/i915_gem_context.h" 43 44 #include "i915_drv.h" 45 #include "i915_gpu_error.h" 46 #include "i915_scatterlist.h" 47 #include "intel_csr.h" 48 49 #define ALLOW_FAIL (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN) 50 #define ATOMIC_MAYFAIL (GFP_ATOMIC | __GFP_NOWARN) 51 52 static inline const struct intel_engine_cs * 53 engine_lookup(const struct drm_i915_private *i915, unsigned int id) 54 { 55 if (id >= I915_NUM_ENGINES) 56 return NULL; 57 58 return i915->engine[id]; 59 } 60 61 static inline const char * 62 __engine_name(const struct intel_engine_cs *engine) 63 { 64 return engine ? 
engine->name : ""; 65 } 66 67 static const char * 68 engine_name(const struct drm_i915_private *i915, unsigned int id) 69 { 70 return __engine_name(engine_lookup(i915, id)); 71 } 72 73 static void __sg_set_buf(struct scatterlist *sg, 74 void *addr, unsigned int len, loff_t it) 75 { 76 sg->page_link = (unsigned long)virt_to_page(addr); 77 sg->offset = offset_in_page(addr); 78 sg->length = len; 79 sg->dma_address = it; 80 } 81 82 static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len) 83 { 84 if (!len) 85 return false; 86 87 if (e->bytes + len + 1 <= e->size) 88 return true; 89 90 if (e->bytes) { 91 __sg_set_buf(e->cur++, e->buf, e->bytes, e->iter); 92 e->iter += e->bytes; 93 e->buf = NULL; 94 e->bytes = 0; 95 } 96 97 if (e->cur == e->end) { 98 struct scatterlist *sgl; 99 100 sgl = (typeof(sgl))__get_free_page(ALLOW_FAIL); 101 if (!sgl) { 102 e->err = -ENOMEM; 103 return false; 104 } 105 106 if (e->cur) { 107 e->cur->offset = 0; 108 e->cur->length = 0; 109 e->cur->page_link = 110 (unsigned long)sgl | SG_CHAIN; 111 } else { 112 e->sgl = sgl; 113 } 114 115 e->cur = sgl; 116 e->end = sgl + SG_MAX_SINGLE_ALLOC - 1; 117 } 118 119 e->size = ALIGN(len + 1, SZ_64K); 120 e->buf = kmalloc(e->size, ALLOW_FAIL); 121 if (!e->buf) { 122 e->size = PAGE_ALIGN(len + 1); 123 e->buf = kmalloc(e->size, GFP_KERNEL); 124 } 125 if (!e->buf) { 126 e->err = -ENOMEM; 127 return false; 128 } 129 130 return true; 131 } 132 133 __printf(2, 0) 134 static void i915_error_vprintf(struct drm_i915_error_state_buf *e, 135 const char *fmt, va_list args) 136 { 137 va_list ap; 138 int len; 139 140 if (e->err) 141 return; 142 143 va_copy(ap, args); 144 len = vsnprintf(NULL, 0, fmt, ap); 145 va_end(ap); 146 if (len <= 0) { 147 e->err = len; 148 return; 149 } 150 151 if (!__i915_error_grow(e, len)) 152 return; 153 154 GEM_BUG_ON(e->bytes >= e->size); 155 len = vscnprintf(e->buf + e->bytes, e->size - e->bytes, fmt, args); 156 if (len < 0) { 157 e->err = len; 158 return; 159 } 160 e->bytes += len; 161 } 162 163 static void i915_error_puts(struct drm_i915_error_state_buf *e, const char *str) 164 { 165 unsigned len; 166 167 if (e->err || !str) 168 return; 169 170 len = strlen(str); 171 if (!__i915_error_grow(e, len)) 172 return; 173 174 GEM_BUG_ON(e->bytes + len > e->size); 175 memcpy(e->buf + e->bytes, str, len); 176 e->bytes += len; 177 } 178 179 #define err_printf(e, ...) 
i915_error_printf(e, __VA_ARGS__) 180 #define err_puts(e, s) i915_error_puts(e, s) 181 182 static void __i915_printfn_error(struct drm_printer *p, struct va_format *vaf) 183 { 184 i915_error_vprintf(p->arg, vaf->fmt, *vaf->va); 185 } 186 187 static inline struct drm_printer 188 i915_error_printer(struct drm_i915_error_state_buf *e) 189 { 190 struct drm_printer p = { 191 .printfn = __i915_printfn_error, 192 .arg = e, 193 }; 194 return p; 195 } 196 197 /* single threaded page allocator with a reserved stash for emergencies */ 198 static void pool_fini(struct pagevec *pv) 199 { 200 pagevec_release(pv); 201 } 202 203 static int pool_refill(struct pagevec *pv, gfp_t gfp) 204 { 205 while (pagevec_space(pv)) { 206 struct page *p; 207 208 p = alloc_page(gfp); 209 if (!p) 210 return -ENOMEM; 211 212 pagevec_add(pv, p); 213 } 214 215 return 0; 216 } 217 218 static int pool_init(struct pagevec *pv, gfp_t gfp) 219 { 220 int err; 221 222 pagevec_init(pv); 223 224 err = pool_refill(pv, gfp); 225 if (err) 226 pool_fini(pv); 227 228 return err; 229 } 230 231 static void *pool_alloc(struct pagevec *pv, gfp_t gfp) 232 { 233 struct page *p; 234 235 p = alloc_page(gfp); 236 if (!p && pagevec_count(pv)) 237 p = pv->pages[--pv->nr]; 238 239 return p ? page_address(p) : NULL; 240 } 241 242 static void pool_free(struct pagevec *pv, void *addr) 243 { 244 struct page *p = virt_to_page(addr); 245 246 if (pagevec_space(pv)) 247 pagevec_add(pv, p); 248 else 249 __free_page(p); 250 } 251 252 #ifdef CONFIG_DRM_I915_COMPRESS_ERROR 253 254 struct compress { 255 struct pagevec pool; 256 struct z_stream_s zstream; 257 void *tmp; 258 }; 259 260 static bool compress_init(struct compress *c) 261 { 262 struct z_stream_s *zstream = &c->zstream; 263 264 if (pool_init(&c->pool, ALLOW_FAIL)) 265 return false; 266 267 zstream->workspace = 268 kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), 269 ALLOW_FAIL); 270 if (!zstream->workspace) { 271 pool_fini(&c->pool); 272 return false; 273 } 274 275 c->tmp = NULL; 276 if (i915_has_memcpy_from_wc()) 277 c->tmp = pool_alloc(&c->pool, ALLOW_FAIL); 278 279 return true; 280 } 281 282 static bool compress_start(struct compress *c) 283 { 284 struct z_stream_s *zstream = &c->zstream; 285 void *workspace = zstream->workspace; 286 287 memset(zstream, 0, sizeof(*zstream)); 288 zstream->workspace = workspace; 289 290 return zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) == Z_OK; 291 } 292 293 static void *compress_next_page(struct compress *c, 294 struct drm_i915_error_object *dst) 295 { 296 void *page; 297 298 if (dst->page_count >= dst->num_pages) 299 return ERR_PTR(-ENOSPC); 300 301 page = pool_alloc(&c->pool, ALLOW_FAIL); 302 if (!page) 303 return ERR_PTR(-ENOMEM); 304 305 return dst->pages[dst->page_count++] = page; 306 } 307 308 static int compress_page(struct compress *c, 309 void *src, 310 struct drm_i915_error_object *dst) 311 { 312 struct z_stream_s *zstream = &c->zstream; 313 314 zstream->next_in = src; 315 if (c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE)) 316 zstream->next_in = c->tmp; 317 zstream->avail_in = PAGE_SIZE; 318 319 do { 320 if (zstream->avail_out == 0) { 321 zstream->next_out = compress_next_page(c, dst); 322 if (IS_ERR(zstream->next_out)) 323 return PTR_ERR(zstream->next_out); 324 325 zstream->avail_out = PAGE_SIZE; 326 } 327 328 if (zlib_deflate(zstream, Z_NO_FLUSH) != Z_OK) 329 return -EIO; 330 } while (zstream->avail_in); 331 332 /* Fallback to uncompressed if we increase size? 
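	 * Note: the size check below is compiled out via the if (0 && ...) guard,
	 * so the compressed copy is kept even when zlib expands the data.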
*/ 333 if (0 && zstream->total_out > zstream->total_in) 334 return -E2BIG; 335 336 return 0; 337 } 338 339 static int compress_flush(struct compress *c, 340 struct drm_i915_error_object *dst) 341 { 342 struct z_stream_s *zstream = &c->zstream; 343 344 do { 345 switch (zlib_deflate(zstream, Z_FINISH)) { 346 case Z_OK: /* more space requested */ 347 zstream->next_out = compress_next_page(c, dst); 348 if (IS_ERR(zstream->next_out)) 349 return PTR_ERR(zstream->next_out); 350 351 zstream->avail_out = PAGE_SIZE; 352 break; 353 354 case Z_STREAM_END: 355 goto end; 356 357 default: /* any error */ 358 return -EIO; 359 } 360 } while (1); 361 362 end: 363 memset(zstream->next_out, 0, zstream->avail_out); 364 dst->unused = zstream->avail_out; 365 return 0; 366 } 367 368 static void compress_finish(struct compress *c) 369 { 370 zlib_deflateEnd(&c->zstream); 371 } 372 373 static void compress_fini(struct compress *c) 374 { 375 kfree(c->zstream.workspace); 376 if (c->tmp) 377 pool_free(&c->pool, c->tmp); 378 pool_fini(&c->pool); 379 } 380 381 static void err_compression_marker(struct drm_i915_error_state_buf *m) 382 { 383 err_puts(m, ":"); 384 } 385 386 #else 387 388 struct compress { 389 struct pagevec pool; 390 }; 391 392 static bool compress_init(struct compress *c) 393 { 394 return pool_init(&c->pool, ALLOW_FAIL) == 0; 395 } 396 397 static bool compress_start(struct compress *c) 398 { 399 return true; 400 } 401 402 static int compress_page(struct compress *c, 403 void *src, 404 struct drm_i915_error_object *dst) 405 { 406 void *ptr; 407 408 ptr = pool_alloc(&c->pool, ALLOW_FAIL); 409 if (!ptr) 410 return -ENOMEM; 411 412 if (!i915_memcpy_from_wc(ptr, src, PAGE_SIZE)) 413 memcpy(ptr, src, PAGE_SIZE); 414 dst->pages[dst->page_count++] = ptr; 415 416 return 0; 417 } 418 419 static int compress_flush(struct compress *c, 420 struct drm_i915_error_object *dst) 421 { 422 return 0; 423 } 424 425 static void compress_finish(struct compress *c) 426 { 427 } 428 429 static void compress_fini(struct compress *c) 430 { 431 pool_fini(&c->pool); 432 } 433 434 static void err_compression_marker(struct drm_i915_error_state_buf *m) 435 { 436 err_puts(m, "~"); 437 } 438 439 #endif 440 441 static void error_print_instdone(struct drm_i915_error_state_buf *m, 442 const struct drm_i915_error_engine *ee) 443 { 444 int slice; 445 int subslice; 446 447 err_printf(m, " INSTDONE: 0x%08x\n", 448 ee->instdone.instdone); 449 450 if (ee->engine_id != RCS0 || INTEL_GEN(m->i915) <= 3) 451 return; 452 453 err_printf(m, " SC_INSTDONE: 0x%08x\n", 454 ee->instdone.slice_common); 455 456 if (INTEL_GEN(m->i915) <= 6) 457 return; 458 459 for_each_instdone_slice_subslice(m->i915, slice, subslice) 460 err_printf(m, " SAMPLER_INSTDONE[%d][%d]: 0x%08x\n", 461 slice, subslice, 462 ee->instdone.sampler[slice][subslice]); 463 464 for_each_instdone_slice_subslice(m->i915, slice, subslice) 465 err_printf(m, " ROW_INSTDONE[%d][%d]: 0x%08x\n", 466 slice, subslice, 467 ee->instdone.row[slice][subslice]); 468 } 469 470 static void error_print_request(struct drm_i915_error_state_buf *m, 471 const char *prefix, 472 const struct drm_i915_error_request *erq, 473 const unsigned long epoch) 474 { 475 if (!erq->seqno) 476 return; 477 478 err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n", 479 prefix, erq->pid, erq->context, erq->seqno, 480 test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, 481 &erq->flags) ? "!" : "", 482 test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, 483 &erq->flags) ? 
"+" : "", 484 erq->sched_attr.priority, 485 jiffies_to_msecs(erq->jiffies - epoch), 486 erq->start, erq->head, erq->tail); 487 } 488 489 static void error_print_context(struct drm_i915_error_state_buf *m, 490 const char *header, 491 const struct drm_i915_error_context *ctx) 492 { 493 err_printf(m, "%s%s[%d] hw_id %d, prio %d, guilty %d active %d\n", 494 header, ctx->comm, ctx->pid, ctx->hw_id, 495 ctx->sched_attr.priority, ctx->guilty, ctx->active); 496 } 497 498 static void error_print_engine(struct drm_i915_error_state_buf *m, 499 const struct drm_i915_error_engine *ee, 500 const unsigned long epoch) 501 { 502 int n; 503 504 err_printf(m, "%s command stream:\n", 505 engine_name(m->i915, ee->engine_id)); 506 err_printf(m, " IDLE?: %s\n", yesno(ee->idle)); 507 err_printf(m, " START: 0x%08x\n", ee->start); 508 err_printf(m, " HEAD: 0x%08x [0x%08x]\n", ee->head, ee->rq_head); 509 err_printf(m, " TAIL: 0x%08x [0x%08x, 0x%08x]\n", 510 ee->tail, ee->rq_post, ee->rq_tail); 511 err_printf(m, " CTL: 0x%08x\n", ee->ctl); 512 err_printf(m, " MODE: 0x%08x\n", ee->mode); 513 err_printf(m, " HWS: 0x%08x\n", ee->hws); 514 err_printf(m, " ACTHD: 0x%08x %08x\n", 515 (u32)(ee->acthd>>32), (u32)ee->acthd); 516 err_printf(m, " IPEIR: 0x%08x\n", ee->ipeir); 517 err_printf(m, " IPEHR: 0x%08x\n", ee->ipehr); 518 519 error_print_instdone(m, ee); 520 521 if (ee->batchbuffer) { 522 u64 start = ee->batchbuffer->gtt_offset; 523 u64 end = start + ee->batchbuffer->gtt_size; 524 525 err_printf(m, " batch: [0x%08x_%08x, 0x%08x_%08x]\n", 526 upper_32_bits(start), lower_32_bits(start), 527 upper_32_bits(end), lower_32_bits(end)); 528 } 529 if (INTEL_GEN(m->i915) >= 4) { 530 err_printf(m, " BBADDR: 0x%08x_%08x\n", 531 (u32)(ee->bbaddr>>32), (u32)ee->bbaddr); 532 err_printf(m, " BB_STATE: 0x%08x\n", ee->bbstate); 533 err_printf(m, " INSTPS: 0x%08x\n", ee->instps); 534 } 535 err_printf(m, " INSTPM: 0x%08x\n", ee->instpm); 536 err_printf(m, " FADDR: 0x%08x %08x\n", upper_32_bits(ee->faddr), 537 lower_32_bits(ee->faddr)); 538 if (INTEL_GEN(m->i915) >= 6) { 539 err_printf(m, " RC PSMI: 0x%08x\n", ee->rc_psmi); 540 err_printf(m, " FAULT_REG: 0x%08x\n", ee->fault_reg); 541 } 542 if (HAS_PPGTT(m->i915)) { 543 err_printf(m, " GFX_MODE: 0x%08x\n", ee->vm_info.gfx_mode); 544 545 if (INTEL_GEN(m->i915) >= 8) { 546 int i; 547 for (i = 0; i < 4; i++) 548 err_printf(m, " PDP%d: 0x%016llx\n", 549 i, ee->vm_info.pdp[i]); 550 } else { 551 err_printf(m, " PP_DIR_BASE: 0x%08x\n", 552 ee->vm_info.pp_dir_base); 553 } 554 } 555 err_printf(m, " ring->head: 0x%08x\n", ee->cpu_ring_head); 556 err_printf(m, " ring->tail: 0x%08x\n", ee->cpu_ring_tail); 557 err_printf(m, " hangcheck timestamp: %dms (%lu%s)\n", 558 jiffies_to_msecs(ee->hangcheck_timestamp - epoch), 559 ee->hangcheck_timestamp, 560 ee->hangcheck_timestamp == epoch ? "; epoch" : ""); 561 err_printf(m, " engine reset count: %u\n", ee->reset_count); 562 563 for (n = 0; n < ee->num_ports; n++) { 564 err_printf(m, " ELSP[%d]:", n); 565 error_print_request(m, " ", &ee->execlist[n], epoch); 566 } 567 568 error_print_context(m, " Active context: ", &ee->context); 569 } 570 571 void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...) 
572 { 573 va_list args; 574 575 va_start(args, f); 576 i915_error_vprintf(e, f, args); 577 va_end(args); 578 } 579 580 static void print_error_obj(struct drm_i915_error_state_buf *m, 581 struct intel_engine_cs *engine, 582 const char *name, 583 struct drm_i915_error_object *obj) 584 { 585 char out[ASCII85_BUFSZ]; 586 int page; 587 588 if (!obj) 589 return; 590 591 if (name) { 592 err_printf(m, "%s --- %s = 0x%08x %08x\n", 593 engine ? engine->name : "global", name, 594 upper_32_bits(obj->gtt_offset), 595 lower_32_bits(obj->gtt_offset)); 596 } 597 598 err_compression_marker(m); 599 for (page = 0; page < obj->page_count; page++) { 600 int i, len; 601 602 len = PAGE_SIZE; 603 if (page == obj->page_count - 1) 604 len -= obj->unused; 605 len = ascii85_encode_len(len); 606 607 for (i = 0; i < len; i++) 608 err_puts(m, ascii85_encode(obj->pages[page][i], out)); 609 } 610 err_puts(m, "\n"); 611 } 612 613 static void err_print_capabilities(struct drm_i915_error_state_buf *m, 614 const struct intel_device_info *info, 615 const struct intel_runtime_info *runtime, 616 const struct intel_driver_caps *caps) 617 { 618 struct drm_printer p = i915_error_printer(m); 619 620 intel_device_info_dump_flags(info, &p); 621 intel_driver_caps_print(caps, &p); 622 intel_device_info_dump_topology(&runtime->sseu, &p); 623 } 624 625 static void err_print_params(struct drm_i915_error_state_buf *m, 626 const struct i915_params *params) 627 { 628 struct drm_printer p = i915_error_printer(m); 629 630 i915_params_dump(params, &p); 631 } 632 633 static void err_print_pciid(struct drm_i915_error_state_buf *m, 634 struct drm_i915_private *i915) 635 { 636 struct pci_dev *pdev = i915->drm.pdev; 637 638 err_printf(m, "PCI ID: 0x%04x\n", pdev->device); 639 err_printf(m, "PCI Revision: 0x%02x\n", pdev->revision); 640 err_printf(m, "PCI Subsystem: %04x:%04x\n", 641 pdev->subsystem_vendor, 642 pdev->subsystem_device); 643 } 644 645 static void err_print_uc(struct drm_i915_error_state_buf *m, 646 const struct i915_error_uc *error_uc) 647 { 648 struct drm_printer p = i915_error_printer(m); 649 const struct i915_gpu_state *error = 650 container_of(error_uc, typeof(*error), uc); 651 652 if (!error->device_info.has_gt_uc) 653 return; 654 655 intel_uc_fw_dump(&error_uc->guc_fw, &p); 656 intel_uc_fw_dump(&error_uc->huc_fw, &p); 657 print_error_obj(m, NULL, "GuC log buffer", error_uc->guc_log); 658 } 659 660 static void err_free_sgl(struct scatterlist *sgl) 661 { 662 while (sgl) { 663 struct scatterlist *sg; 664 665 for (sg = sgl; !sg_is_chain(sg); sg++) { 666 kfree(sg_virt(sg)); 667 if (sg_is_last(sg)) 668 break; 669 } 670 671 sg = sg_is_last(sg) ? 
NULL : sg_chain_ptr(sg); 672 free_page((unsigned long)sgl); 673 sgl = sg; 674 } 675 } 676 677 static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, 678 struct i915_gpu_state *error) 679 { 680 struct drm_i915_error_object *obj; 681 struct timespec64 ts; 682 int i, j; 683 684 if (*error->error_msg) 685 err_printf(m, "%s\n", error->error_msg); 686 err_printf(m, "Kernel: %s %s\n", 687 init_utsname()->release, 688 init_utsname()->machine); 689 ts = ktime_to_timespec64(error->time); 690 err_printf(m, "Time: %lld s %ld us\n", 691 (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC); 692 ts = ktime_to_timespec64(error->boottime); 693 err_printf(m, "Boottime: %lld s %ld us\n", 694 (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC); 695 ts = ktime_to_timespec64(error->uptime); 696 err_printf(m, "Uptime: %lld s %ld us\n", 697 (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC); 698 err_printf(m, "Epoch: %lu jiffies (%u HZ)\n", error->epoch, HZ); 699 err_printf(m, "Capture: %lu jiffies; %d ms ago, %d ms after epoch\n", 700 error->capture, 701 jiffies_to_msecs(jiffies - error->capture), 702 jiffies_to_msecs(error->capture - error->epoch)); 703 704 for (i = 0; i < ARRAY_SIZE(error->engine); i++) { 705 if (!error->engine[i].context.pid) 706 continue; 707 708 err_printf(m, "Active process (on ring %s): %s [%d]\n", 709 engine_name(m->i915, i), 710 error->engine[i].context.comm, 711 error->engine[i].context.pid); 712 } 713 err_printf(m, "Reset count: %u\n", error->reset_count); 714 err_printf(m, "Suspend count: %u\n", error->suspend_count); 715 err_printf(m, "Platform: %s\n", intel_platform_name(error->device_info.platform)); 716 err_printf(m, "Subplatform: 0x%x\n", 717 intel_subplatform(&error->runtime_info, 718 error->device_info.platform)); 719 err_print_pciid(m, m->i915); 720 721 err_printf(m, "IOMMU enabled?: %d\n", error->iommu); 722 723 if (HAS_CSR(m->i915)) { 724 struct intel_csr *csr = &m->i915->csr; 725 726 err_printf(m, "DMC loaded: %s\n", 727 yesno(csr->dmc_payload != NULL)); 728 err_printf(m, "DMC fw version: %d.%d\n", 729 CSR_VERSION_MAJOR(csr->version), 730 CSR_VERSION_MINOR(csr->version)); 731 } 732 733 err_printf(m, "GT awake: %s\n", yesno(error->awake)); 734 err_printf(m, "RPM wakelock: %s\n", yesno(error->wakelock)); 735 err_printf(m, "PM suspended: %s\n", yesno(error->suspended)); 736 err_printf(m, "EIR: 0x%08x\n", error->eir); 737 err_printf(m, "IER: 0x%08x\n", error->ier); 738 for (i = 0; i < error->ngtier; i++) 739 err_printf(m, "GTIER[%d]: 0x%08x\n", i, error->gtier[i]); 740 err_printf(m, "PGTBL_ER: 0x%08x\n", error->pgtbl_er); 741 err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake); 742 err_printf(m, "DERRMR: 0x%08x\n", error->derrmr); 743 err_printf(m, "CCID: 0x%08x\n", error->ccid); 744 745 for (i = 0; i < error->nfence; i++) 746 err_printf(m, " fence[%d] = %08llx\n", i, error->fence[i]); 747 748 if (INTEL_GEN(m->i915) >= 6) { 749 err_printf(m, "ERROR: 0x%08x\n", error->error); 750 751 if (INTEL_GEN(m->i915) >= 8) 752 err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n", 753 error->fault_data1, error->fault_data0); 754 755 err_printf(m, "DONE_REG: 0x%08x\n", error->done_reg); 756 } 757 758 if (IS_GEN(m->i915, 7)) 759 err_printf(m, "ERR_INT: 0x%08x\n", error->err_int); 760 761 for (i = 0; i < ARRAY_SIZE(error->engine); i++) { 762 if (error->engine[i].engine_id != -1) 763 error_print_engine(m, &error->engine[i], error->epoch); 764 } 765 766 for (i = 0; i < ARRAY_SIZE(error->engine); i++) { 767 const struct drm_i915_error_engine *ee = &error->engine[i]; 768 769 obj = ee->batchbuffer; 770 if (obj) 
{ 771 err_puts(m, m->i915->engine[i]->name); 772 if (ee->context.pid) 773 err_printf(m, " (submitted by %s [%d])", 774 ee->context.comm, 775 ee->context.pid); 776 err_printf(m, " --- gtt_offset = 0x%08x %08x\n", 777 upper_32_bits(obj->gtt_offset), 778 lower_32_bits(obj->gtt_offset)); 779 print_error_obj(m, m->i915->engine[i], NULL, obj); 780 } 781 782 for (j = 0; j < ee->user_bo_count; j++) 783 print_error_obj(m, m->i915->engine[i], 784 "user", ee->user_bo[j]); 785 786 if (ee->num_requests) { 787 err_printf(m, "%s --- %d requests\n", 788 m->i915->engine[i]->name, 789 ee->num_requests); 790 for (j = 0; j < ee->num_requests; j++) 791 error_print_request(m, " ", 792 &ee->requests[j], 793 error->epoch); 794 } 795 796 print_error_obj(m, m->i915->engine[i], 797 "ringbuffer", ee->ringbuffer); 798 799 print_error_obj(m, m->i915->engine[i], 800 "HW Status", ee->hws_page); 801 802 print_error_obj(m, m->i915->engine[i], 803 "HW context", ee->ctx); 804 805 print_error_obj(m, m->i915->engine[i], 806 "WA context", ee->wa_ctx); 807 808 print_error_obj(m, m->i915->engine[i], 809 "WA batchbuffer", ee->wa_batchbuffer); 810 811 print_error_obj(m, m->i915->engine[i], 812 "NULL context", ee->default_state); 813 } 814 815 if (error->overlay) 816 intel_overlay_print_error_state(m, error->overlay); 817 818 if (error->display) 819 intel_display_print_error_state(m, error->display); 820 821 err_print_capabilities(m, &error->device_info, &error->runtime_info, 822 &error->driver_caps); 823 err_print_params(m, &error->params); 824 err_print_uc(m, &error->uc); 825 } 826 827 static int err_print_to_sgl(struct i915_gpu_state *error) 828 { 829 struct drm_i915_error_state_buf m; 830 831 if (IS_ERR(error)) 832 return PTR_ERR(error); 833 834 if (READ_ONCE(error->sgl)) 835 return 0; 836 837 memset(&m, 0, sizeof(m)); 838 m.i915 = error->i915; 839 840 __err_print_to_sgl(&m, error); 841 842 if (m.buf) { 843 __sg_set_buf(m.cur++, m.buf, m.bytes, m.iter); 844 m.bytes = 0; 845 m.buf = NULL; 846 } 847 if (m.cur) { 848 GEM_BUG_ON(m.end < m.cur); 849 sg_mark_end(m.cur - 1); 850 } 851 GEM_BUG_ON(m.sgl && !m.cur); 852 853 if (m.err) { 854 err_free_sgl(m.sgl); 855 return m.err; 856 } 857 858 if (cmpxchg(&error->sgl, NULL, m.sgl)) 859 err_free_sgl(m.sgl); 860 861 return 0; 862 } 863 864 ssize_t i915_gpu_state_copy_to_buffer(struct i915_gpu_state *error, 865 char *buf, loff_t off, size_t rem) 866 { 867 struct scatterlist *sg; 868 size_t count; 869 loff_t pos; 870 int err; 871 872 if (!error || !rem) 873 return 0; 874 875 err = err_print_to_sgl(error); 876 if (err) 877 return err; 878 879 sg = READ_ONCE(error->fit); 880 if (!sg || off < sg->dma_address) 881 sg = error->sgl; 882 if (!sg) 883 return 0; 884 885 pos = sg->dma_address; 886 count = 0; 887 do { 888 size_t len, start; 889 890 if (sg_is_chain(sg)) { 891 sg = sg_chain_ptr(sg); 892 GEM_BUG_ON(sg_is_chain(sg)); 893 } 894 895 len = sg->length; 896 if (pos + len <= off) { 897 pos += len; 898 continue; 899 } 900 901 start = sg->offset; 902 if (pos < off) { 903 GEM_BUG_ON(off - pos > len); 904 len -= off - pos; 905 start += off - pos; 906 pos = off; 907 } 908 909 len = min(len, rem); 910 GEM_BUG_ON(!len || len > sg->length); 911 912 memcpy(buf, page_address(sg_page(sg)) + start, len); 913 914 count += len; 915 pos += len; 916 917 buf += len; 918 rem -= len; 919 if (!rem) { 920 WRITE_ONCE(error->fit, sg); 921 break; 922 } 923 } while (!sg_is_last(sg++)); 924 925 return count; 926 } 927 928 static void i915_error_object_free(struct drm_i915_error_object *obj) 929 { 930 int page; 931 932 if 
(obj == NULL)
		return;

	for (page = 0; page < obj->page_count; page++)
		free_page((unsigned long)obj->pages[page]);

	kfree(obj);
}

static void cleanup_params(struct i915_gpu_state *error)
{
	i915_params_free(&error->params);
}

static void cleanup_uc_state(struct i915_gpu_state *error)
{
	struct i915_error_uc *error_uc = &error->uc;

	kfree(error_uc->guc_fw.path);
	kfree(error_uc->huc_fw.path);
	i915_error_object_free(error_uc->guc_log);
}

void __i915_gpu_state_free(struct kref *error_ref)
{
	struct i915_gpu_state *error =
		container_of(error_ref, typeof(*error), ref);
	long i, j;

	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
		struct drm_i915_error_engine *ee = &error->engine[i];

		for (j = 0; j < ee->user_bo_count; j++)
			i915_error_object_free(ee->user_bo[j]);
		kfree(ee->user_bo);

		i915_error_object_free(ee->batchbuffer);
		i915_error_object_free(ee->wa_batchbuffer);
		i915_error_object_free(ee->ringbuffer);
		i915_error_object_free(ee->hws_page);
		i915_error_object_free(ee->ctx);
		i915_error_object_free(ee->wa_ctx);

		kfree(ee->requests);
	}

	kfree(error->overlay);
	kfree(error->display);

	cleanup_params(error);
	cleanup_uc_state(error);

	err_free_sgl(error->sgl);
	kfree(error);
}

static struct drm_i915_error_object *
i915_error_object_create(struct drm_i915_private *i915,
			 struct i915_vma *vma,
			 struct compress *compress)
{
	struct i915_ggtt *ggtt = &i915->ggtt;
	const u64 slot = ggtt->error_capture.start;
	struct drm_i915_error_object *dst;
	unsigned long num_pages;
	struct sgt_iter iter;
	dma_addr_t dma;
	int ret;

	might_sleep();

	if (!vma || !vma->pages)
		return NULL;

	num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT;
	num_pages = DIV_ROUND_UP(10 * num_pages, 8); /* worst-case zlib growth */
	dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *), ALLOW_FAIL);
	if (!dst)
		return NULL;

	if (!compress_start(compress)) {
		kfree(dst);
		return NULL;
	}

	dst->gtt_offset = vma->node.start;
	dst->gtt_size = vma->node.size;
	dst->num_pages = num_pages;
	dst->page_count = 0;
	dst->unused = 0;

	ret = -EINVAL;
	for_each_sgt_dma(dma, iter, vma->pages) {
		void __iomem *s;

		ggtt->vm.insert_page(&ggtt->vm, dma, slot, I915_CACHE_NONE, 0);

		s = io_mapping_map_wc(&ggtt->iomap, slot, PAGE_SIZE);
		ret = compress_page(compress, (void __force *)s, dst);
		io_mapping_unmap(s);
		if (ret)
			break;
	}

	if (ret || compress_flush(compress, dst)) {
		while (dst->page_count--)
			pool_free(&compress->pool, dst->pages[dst->page_count]);
		kfree(dst);
		dst = NULL;
	}
	compress_finish(compress);

	return dst;
}

/*
 * Generate a semi-unique error code. The code is not meant to have meaning;
 * its only purpose is to try to prevent false duplicated bug reports by
 * grossly estimating a GPU error state.
 *
 * TODO Ideally, hashing the batchbuffer would be a very nice way to determine
 * the hang if we could strip the GTT offset information from it.
 *
 * It's only a small step better than a random number in its current form.
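 *
 * Today this boils down to the IPEHR of the first hung engine XORed with
 * that engine's INSTDONE snapshot (see i915_error_generate_code() below).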
1057 */ 1058 static u32 i915_error_generate_code(struct i915_gpu_state *error, 1059 intel_engine_mask_t engine_mask) 1060 { 1061 /* 1062 * IPEHR would be an ideal way to detect errors, as it's the gross 1063 * measure of "the command that hung." However, has some very common 1064 * synchronization commands which almost always appear in the case 1065 * strictly a client bug. Use instdone to differentiate those some. 1066 */ 1067 if (engine_mask) { 1068 struct drm_i915_error_engine *ee = 1069 &error->engine[ffs(engine_mask)]; 1070 1071 return ee->ipehr ^ ee->instdone.instdone; 1072 } 1073 1074 return 0; 1075 } 1076 1077 static void gem_record_fences(struct i915_gpu_state *error) 1078 { 1079 struct drm_i915_private *dev_priv = error->i915; 1080 struct intel_uncore *uncore = &dev_priv->uncore; 1081 int i; 1082 1083 if (INTEL_GEN(dev_priv) >= 6) { 1084 for (i = 0; i < dev_priv->ggtt.num_fences; i++) 1085 error->fence[i] = 1086 intel_uncore_read64(uncore, 1087 FENCE_REG_GEN6_LO(i)); 1088 } else if (INTEL_GEN(dev_priv) >= 4) { 1089 for (i = 0; i < dev_priv->ggtt.num_fences; i++) 1090 error->fence[i] = 1091 intel_uncore_read64(uncore, 1092 FENCE_REG_965_LO(i)); 1093 } else { 1094 for (i = 0; i < dev_priv->ggtt.num_fences; i++) 1095 error->fence[i] = 1096 intel_uncore_read(uncore, FENCE_REG(i)); 1097 } 1098 error->nfence = i; 1099 } 1100 1101 static void error_record_engine_registers(struct i915_gpu_state *error, 1102 struct intel_engine_cs *engine, 1103 struct drm_i915_error_engine *ee) 1104 { 1105 struct drm_i915_private *dev_priv = engine->i915; 1106 1107 if (INTEL_GEN(dev_priv) >= 6) { 1108 ee->rc_psmi = ENGINE_READ(engine, RING_PSMI_CTL); 1109 if (INTEL_GEN(dev_priv) >= 8) 1110 ee->fault_reg = I915_READ(GEN8_RING_FAULT_REG); 1111 else 1112 ee->fault_reg = GEN6_RING_FAULT_REG_READ(engine); 1113 } 1114 1115 if (INTEL_GEN(dev_priv) >= 4) { 1116 ee->faddr = ENGINE_READ(engine, RING_DMA_FADD); 1117 ee->ipeir = ENGINE_READ(engine, RING_IPEIR); 1118 ee->ipehr = ENGINE_READ(engine, RING_IPEHR); 1119 ee->instps = ENGINE_READ(engine, RING_INSTPS); 1120 ee->bbaddr = ENGINE_READ(engine, RING_BBADDR); 1121 if (INTEL_GEN(dev_priv) >= 8) { 1122 ee->faddr |= (u64)ENGINE_READ(engine, RING_DMA_FADD_UDW) << 32; 1123 ee->bbaddr |= (u64)ENGINE_READ(engine, RING_BBADDR_UDW) << 32; 1124 } 1125 ee->bbstate = ENGINE_READ(engine, RING_BBSTATE); 1126 } else { 1127 ee->faddr = ENGINE_READ(engine, DMA_FADD_I8XX); 1128 ee->ipeir = ENGINE_READ(engine, IPEIR); 1129 ee->ipehr = ENGINE_READ(engine, IPEHR); 1130 } 1131 1132 intel_engine_get_instdone(engine, &ee->instdone); 1133 1134 ee->instpm = ENGINE_READ(engine, RING_INSTPM); 1135 ee->acthd = intel_engine_get_active_head(engine); 1136 ee->start = ENGINE_READ(engine, RING_START); 1137 ee->head = ENGINE_READ(engine, RING_HEAD); 1138 ee->tail = ENGINE_READ(engine, RING_TAIL); 1139 ee->ctl = ENGINE_READ(engine, RING_CTL); 1140 if (INTEL_GEN(dev_priv) > 2) 1141 ee->mode = ENGINE_READ(engine, RING_MI_MODE); 1142 1143 if (!HWS_NEEDS_PHYSICAL(dev_priv)) { 1144 i915_reg_t mmio; 1145 1146 if (IS_GEN(dev_priv, 7)) { 1147 switch (engine->id) { 1148 default: 1149 MISSING_CASE(engine->id); 1150 /* fall through */ 1151 case RCS0: 1152 mmio = RENDER_HWS_PGA_GEN7; 1153 break; 1154 case BCS0: 1155 mmio = BLT_HWS_PGA_GEN7; 1156 break; 1157 case VCS0: 1158 mmio = BSD_HWS_PGA_GEN7; 1159 break; 1160 case VECS0: 1161 mmio = VEBOX_HWS_PGA_GEN7; 1162 break; 1163 } 1164 } else if (IS_GEN(engine->i915, 6)) { 1165 mmio = RING_HWS_PGA_GEN6(engine->mmio_base); 1166 } else { 1167 /* XXX: gen8 returns to 
sanity */ 1168 mmio = RING_HWS_PGA(engine->mmio_base); 1169 } 1170 1171 ee->hws = I915_READ(mmio); 1172 } 1173 1174 ee->idle = intel_engine_is_idle(engine); 1175 if (!ee->idle) 1176 ee->hangcheck_timestamp = engine->hangcheck.action_timestamp; 1177 ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error, 1178 engine); 1179 1180 if (HAS_PPGTT(dev_priv)) { 1181 int i; 1182 1183 ee->vm_info.gfx_mode = ENGINE_READ(engine, RING_MODE_GEN7); 1184 1185 if (IS_GEN(dev_priv, 6)) { 1186 ee->vm_info.pp_dir_base = 1187 ENGINE_READ(engine, RING_PP_DIR_BASE_READ); 1188 } else if (IS_GEN(dev_priv, 7)) { 1189 ee->vm_info.pp_dir_base = 1190 ENGINE_READ(engine, RING_PP_DIR_BASE); 1191 } else if (INTEL_GEN(dev_priv) >= 8) { 1192 u32 base = engine->mmio_base; 1193 1194 for (i = 0; i < 4; i++) { 1195 ee->vm_info.pdp[i] = 1196 I915_READ(GEN8_RING_PDP_UDW(base, i)); 1197 ee->vm_info.pdp[i] <<= 32; 1198 ee->vm_info.pdp[i] |= 1199 I915_READ(GEN8_RING_PDP_LDW(base, i)); 1200 } 1201 } 1202 } 1203 } 1204 1205 static void record_request(const struct i915_request *request, 1206 struct drm_i915_error_request *erq) 1207 { 1208 const struct i915_gem_context *ctx = request->gem_context; 1209 1210 erq->flags = request->fence.flags; 1211 erq->context = request->fence.context; 1212 erq->seqno = request->fence.seqno; 1213 erq->sched_attr = request->sched.attr; 1214 erq->jiffies = request->emitted_jiffies; 1215 erq->start = i915_ggtt_offset(request->ring->vma); 1216 erq->head = request->head; 1217 erq->tail = request->tail; 1218 1219 rcu_read_lock(); 1220 erq->pid = ctx->pid ? pid_nr(ctx->pid) : 0; 1221 rcu_read_unlock(); 1222 } 1223 1224 static void engine_record_requests(struct intel_engine_cs *engine, 1225 struct i915_request *first, 1226 struct drm_i915_error_engine *ee) 1227 { 1228 struct i915_request *request; 1229 int count; 1230 1231 count = 0; 1232 request = first; 1233 list_for_each_entry_from(request, &engine->active.requests, sched.link) 1234 count++; 1235 if (!count) 1236 return; 1237 1238 ee->requests = kcalloc(count, sizeof(*ee->requests), ATOMIC_MAYFAIL); 1239 if (!ee->requests) 1240 return; 1241 1242 ee->num_requests = count; 1243 1244 count = 0; 1245 request = first; 1246 list_for_each_entry_from(request, 1247 &engine->active.requests, sched.link) { 1248 if (count >= ee->num_requests) { 1249 /* 1250 * If the ring request list was changed in 1251 * between the point where the error request 1252 * list was created and dimensioned and this 1253 * point then just exit early to avoid crashes. 1254 * 1255 * We don't need to communicate that the 1256 * request list changed state during error 1257 * state capture and that the error state is 1258 * slightly incorrect as a consequence since we 1259 * are typically only interested in the request 1260 * list state at the point of error state 1261 * capture, not in any changes happening during 1262 * the capture. 
1263 */ 1264 break; 1265 } 1266 1267 record_request(request, &ee->requests[count++]); 1268 } 1269 ee->num_requests = count; 1270 } 1271 1272 static void error_record_engine_execlists(const struct intel_engine_cs *engine, 1273 struct drm_i915_error_engine *ee) 1274 { 1275 const struct intel_engine_execlists * const execlists = &engine->execlists; 1276 struct i915_request * const *port = execlists->active; 1277 unsigned int n = 0; 1278 1279 while (*port) 1280 record_request(*port++, &ee->execlist[n++]); 1281 1282 ee->num_ports = n; 1283 } 1284 1285 static void record_context(struct drm_i915_error_context *e, 1286 struct i915_gem_context *ctx) 1287 { 1288 if (ctx->pid) { 1289 struct task_struct *task; 1290 1291 rcu_read_lock(); 1292 task = pid_task(ctx->pid, PIDTYPE_PID); 1293 if (task) { 1294 strcpy(e->comm, task->comm); 1295 e->pid = task->pid; 1296 } 1297 rcu_read_unlock(); 1298 } 1299 1300 e->hw_id = ctx->hw_id; 1301 e->sched_attr = ctx->sched; 1302 e->guilty = atomic_read(&ctx->guilty_count); 1303 e->active = atomic_read(&ctx->active_count); 1304 } 1305 1306 struct capture_vma { 1307 struct capture_vma *next; 1308 void **slot; 1309 }; 1310 1311 static struct capture_vma * 1312 capture_vma(struct capture_vma *next, 1313 struct i915_vma *vma, 1314 struct drm_i915_error_object **out) 1315 { 1316 struct capture_vma *c; 1317 1318 *out = NULL; 1319 if (!vma) 1320 return next; 1321 1322 c = kmalloc(sizeof(*c), ATOMIC_MAYFAIL); 1323 if (!c) 1324 return next; 1325 1326 if (!i915_active_trygrab(&vma->active)) { 1327 kfree(c); 1328 return next; 1329 } 1330 1331 c->slot = (void **)out; 1332 *c->slot = i915_vma_get(vma); 1333 1334 c->next = next; 1335 return c; 1336 } 1337 1338 static struct capture_vma * 1339 request_record_user_bo(struct i915_request *request, 1340 struct drm_i915_error_engine *ee, 1341 struct capture_vma *capture) 1342 { 1343 struct i915_capture_list *c; 1344 struct drm_i915_error_object **bo; 1345 long count, max; 1346 1347 max = 0; 1348 for (c = request->capture_list; c; c = c->next) 1349 max++; 1350 if (!max) 1351 return capture; 1352 1353 bo = kmalloc_array(max, sizeof(*bo), ATOMIC_MAYFAIL); 1354 if (!bo) { 1355 /* If we can't capture everything, try to capture something. 
*/ 1356 max = min_t(long, max, PAGE_SIZE / sizeof(*bo)); 1357 bo = kmalloc_array(max, sizeof(*bo), ATOMIC_MAYFAIL); 1358 } 1359 if (!bo) 1360 return capture; 1361 1362 count = 0; 1363 for (c = request->capture_list; c; c = c->next) { 1364 capture = capture_vma(capture, c->vma, &bo[count]); 1365 if (++count == max) 1366 break; 1367 } 1368 1369 ee->user_bo = bo; 1370 ee->user_bo_count = count; 1371 1372 return capture; 1373 } 1374 1375 static struct drm_i915_error_object * 1376 capture_object(struct drm_i915_private *dev_priv, 1377 struct drm_i915_gem_object *obj, 1378 struct compress *compress) 1379 { 1380 if (obj && i915_gem_object_has_pages(obj)) { 1381 struct i915_vma fake = { 1382 .node = { .start = U64_MAX, .size = obj->base.size }, 1383 .size = obj->base.size, 1384 .pages = obj->mm.pages, 1385 .obj = obj, 1386 }; 1387 1388 return i915_error_object_create(dev_priv, &fake, compress); 1389 } else { 1390 return NULL; 1391 } 1392 } 1393 1394 static void 1395 gem_record_rings(struct i915_gpu_state *error, struct compress *compress) 1396 { 1397 struct drm_i915_private *i915 = error->i915; 1398 int i; 1399 1400 for (i = 0; i < I915_NUM_ENGINES; i++) { 1401 struct intel_engine_cs *engine = i915->engine[i]; 1402 struct drm_i915_error_engine *ee = &error->engine[i]; 1403 struct capture_vma *capture = NULL; 1404 struct i915_request *request; 1405 unsigned long flags; 1406 1407 ee->engine_id = -1; 1408 1409 if (!engine) 1410 continue; 1411 1412 ee->engine_id = i; 1413 1414 /* Refill our page pool before entering atomic section */ 1415 pool_refill(&compress->pool, ALLOW_FAIL); 1416 1417 error_record_engine_registers(error, engine, ee); 1418 error_record_engine_execlists(engine, ee); 1419 1420 spin_lock_irqsave(&engine->active.lock, flags); 1421 request = intel_engine_find_active_request(engine); 1422 if (request) { 1423 struct i915_gem_context *ctx = request->gem_context; 1424 struct intel_ring *ring = request->ring; 1425 1426 record_context(&ee->context, ctx); 1427 1428 /* 1429 * We need to copy these to an anonymous buffer 1430 * as the simplest method to avoid being overwritten 1431 * by userspace. 
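			 * The batch, any buffers on the request's capture list, the
			 * context image and the ring are grabbed here under the engine
			 * lock and copied out after the lock is dropped.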
1432 */ 1433 capture = capture_vma(capture, 1434 request->batch, 1435 &ee->batchbuffer); 1436 1437 if (HAS_BROKEN_CS_TLB(i915)) 1438 capture = capture_vma(capture, 1439 engine->gt->scratch, 1440 &ee->wa_batchbuffer); 1441 1442 capture = request_record_user_bo(request, ee, capture); 1443 1444 capture = capture_vma(capture, 1445 request->hw_context->state, 1446 &ee->ctx); 1447 1448 capture = capture_vma(capture, 1449 ring->vma, 1450 &ee->ringbuffer); 1451 1452 error->simulated |= 1453 i915_gem_context_no_error_capture(ctx); 1454 1455 ee->rq_head = request->head; 1456 ee->rq_post = request->postfix; 1457 ee->rq_tail = request->tail; 1458 1459 ee->cpu_ring_head = ring->head; 1460 ee->cpu_ring_tail = ring->tail; 1461 1462 engine_record_requests(engine, request, ee); 1463 } 1464 spin_unlock_irqrestore(&engine->active.lock, flags); 1465 1466 while (capture) { 1467 struct capture_vma *this = capture; 1468 struct i915_vma *vma = *this->slot; 1469 1470 *this->slot = 1471 i915_error_object_create(i915, vma, compress); 1472 1473 i915_active_ungrab(&vma->active); 1474 i915_vma_put(vma); 1475 1476 capture = this->next; 1477 kfree(this); 1478 } 1479 1480 ee->hws_page = 1481 i915_error_object_create(i915, 1482 engine->status_page.vma, 1483 compress); 1484 1485 ee->wa_ctx = 1486 i915_error_object_create(i915, 1487 engine->wa_ctx.vma, 1488 compress); 1489 1490 ee->default_state = 1491 capture_object(i915, engine->default_state, compress); 1492 } 1493 } 1494 1495 static void 1496 capture_uc_state(struct i915_gpu_state *error, struct compress *compress) 1497 { 1498 struct drm_i915_private *i915 = error->i915; 1499 struct i915_error_uc *error_uc = &error->uc; 1500 struct intel_uc *uc = &i915->gt.uc; 1501 1502 /* Capturing uC state won't be useful if there is no GuC */ 1503 if (!error->device_info.has_gt_uc) 1504 return; 1505 1506 error_uc->guc_fw = uc->guc.fw; 1507 error_uc->huc_fw = uc->huc.fw; 1508 1509 /* Non-default firmware paths will be specified by the modparam. 1510 * As modparams are generally accesible from the userspace make 1511 * explicit copies of the firmware paths. 1512 */ 1513 error_uc->guc_fw.path = kstrdup(uc->guc.fw.path, ALLOW_FAIL); 1514 error_uc->huc_fw.path = kstrdup(uc->huc.fw.path, ALLOW_FAIL); 1515 error_uc->guc_log = i915_error_object_create(i915, 1516 uc->guc.log.vma, 1517 compress); 1518 } 1519 1520 /* Capture all registers which don't fit into another category. */ 1521 static void capture_reg_state(struct i915_gpu_state *error) 1522 { 1523 struct drm_i915_private *i915 = error->i915; 1524 struct intel_uncore *uncore = &i915->uncore; 1525 int i; 1526 1527 /* General organization 1528 * 1. Registers specific to a single generation 1529 * 2. Registers which belong to multiple generations 1530 * 3. Feature specific registers. 1531 * 4. Everything else 1532 * Please try to follow the order. 
1533 */ 1534 1535 /* 1: Registers specific to a single generation */ 1536 if (IS_VALLEYVIEW(i915)) { 1537 error->gtier[0] = intel_uncore_read(uncore, GTIER); 1538 error->ier = intel_uncore_read(uncore, VLV_IER); 1539 error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_VLV); 1540 } 1541 1542 if (IS_GEN(i915, 7)) 1543 error->err_int = intel_uncore_read(uncore, GEN7_ERR_INT); 1544 1545 if (INTEL_GEN(i915) >= 8) { 1546 error->fault_data0 = intel_uncore_read(uncore, 1547 GEN8_FAULT_TLB_DATA0); 1548 error->fault_data1 = intel_uncore_read(uncore, 1549 GEN8_FAULT_TLB_DATA1); 1550 } 1551 1552 if (IS_GEN(i915, 6)) { 1553 error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE); 1554 error->gab_ctl = intel_uncore_read(uncore, GAB_CTL); 1555 error->gfx_mode = intel_uncore_read(uncore, GFX_MODE); 1556 } 1557 1558 /* 2: Registers which belong to multiple generations */ 1559 if (INTEL_GEN(i915) >= 7) 1560 error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_MT); 1561 1562 if (INTEL_GEN(i915) >= 6) { 1563 error->derrmr = intel_uncore_read(uncore, DERRMR); 1564 error->error = intel_uncore_read(uncore, ERROR_GEN6); 1565 error->done_reg = intel_uncore_read(uncore, DONE_REG); 1566 } 1567 1568 if (INTEL_GEN(i915) >= 5) 1569 error->ccid = intel_uncore_read(uncore, CCID(RENDER_RING_BASE)); 1570 1571 /* 3: Feature specific registers */ 1572 if (IS_GEN_RANGE(i915, 6, 7)) { 1573 error->gam_ecochk = intel_uncore_read(uncore, GAM_ECOCHK); 1574 error->gac_eco = intel_uncore_read(uncore, GAC_ECO_BITS); 1575 } 1576 1577 /* 4: Everything else */ 1578 if (INTEL_GEN(i915) >= 11) { 1579 error->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER); 1580 error->gtier[0] = 1581 intel_uncore_read(uncore, 1582 GEN11_RENDER_COPY_INTR_ENABLE); 1583 error->gtier[1] = 1584 intel_uncore_read(uncore, GEN11_VCS_VECS_INTR_ENABLE); 1585 error->gtier[2] = 1586 intel_uncore_read(uncore, GEN11_GUC_SG_INTR_ENABLE); 1587 error->gtier[3] = 1588 intel_uncore_read(uncore, 1589 GEN11_GPM_WGBOXPERF_INTR_ENABLE); 1590 error->gtier[4] = 1591 intel_uncore_read(uncore, 1592 GEN11_CRYPTO_RSVD_INTR_ENABLE); 1593 error->gtier[5] = 1594 intel_uncore_read(uncore, 1595 GEN11_GUNIT_CSME_INTR_ENABLE); 1596 error->ngtier = 6; 1597 } else if (INTEL_GEN(i915) >= 8) { 1598 error->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER); 1599 for (i = 0; i < 4; i++) 1600 error->gtier[i] = intel_uncore_read(uncore, 1601 GEN8_GT_IER(i)); 1602 error->ngtier = 4; 1603 } else if (HAS_PCH_SPLIT(i915)) { 1604 error->ier = intel_uncore_read(uncore, DEIER); 1605 error->gtier[0] = intel_uncore_read(uncore, GTIER); 1606 error->ngtier = 1; 1607 } else if (IS_GEN(i915, 2)) { 1608 error->ier = intel_uncore_read16(uncore, GEN2_IER); 1609 } else if (!IS_VALLEYVIEW(i915)) { 1610 error->ier = intel_uncore_read(uncore, GEN2_IER); 1611 } 1612 error->eir = intel_uncore_read(uncore, EIR); 1613 error->pgtbl_er = intel_uncore_read(uncore, PGTBL_ER); 1614 } 1615 1616 static const char * 1617 error_msg(struct i915_gpu_state *error, 1618 intel_engine_mask_t engines, const char *msg) 1619 { 1620 int len; 1621 int i; 1622 1623 for (i = 0; i < ARRAY_SIZE(error->engine); i++) 1624 if (!error->engine[i].context.pid) 1625 engines &= ~BIT(i); 1626 1627 len = scnprintf(error->error_msg, sizeof(error->error_msg), 1628 "GPU HANG: ecode %d:%x:0x%08x", 1629 INTEL_GEN(error->i915), engines, 1630 i915_error_generate_code(error, engines)); 1631 if (engines) { 1632 /* Just show the first executing process, more is confusing */ 1633 i = __ffs(engines); 1634 len += scnprintf(error->error_msg + len, 1635 
sizeof(error->error_msg) - len, 1636 ", in %s [%d]", 1637 error->engine[i].context.comm, 1638 error->engine[i].context.pid); 1639 } 1640 if (msg) 1641 len += scnprintf(error->error_msg + len, 1642 sizeof(error->error_msg) - len, 1643 ", %s", msg); 1644 1645 return error->error_msg; 1646 } 1647 1648 static void capture_gen_state(struct i915_gpu_state *error) 1649 { 1650 struct drm_i915_private *i915 = error->i915; 1651 1652 error->awake = i915->gt.awake; 1653 error->wakelock = atomic_read(&i915->runtime_pm.wakeref_count); 1654 error->suspended = i915->runtime_pm.suspended; 1655 1656 error->iommu = -1; 1657 #ifdef CONFIG_INTEL_IOMMU 1658 error->iommu = intel_iommu_gfx_mapped; 1659 #endif 1660 error->reset_count = i915_reset_count(&i915->gpu_error); 1661 error->suspend_count = i915->suspend_count; 1662 1663 memcpy(&error->device_info, 1664 INTEL_INFO(i915), 1665 sizeof(error->device_info)); 1666 memcpy(&error->runtime_info, 1667 RUNTIME_INFO(i915), 1668 sizeof(error->runtime_info)); 1669 error->driver_caps = i915->caps; 1670 } 1671 1672 static void capture_params(struct i915_gpu_state *error) 1673 { 1674 i915_params_copy(&error->params, &i915_modparams); 1675 } 1676 1677 static unsigned long capture_find_epoch(const struct i915_gpu_state *error) 1678 { 1679 unsigned long epoch = error->capture; 1680 int i; 1681 1682 for (i = 0; i < ARRAY_SIZE(error->engine); i++) { 1683 const struct drm_i915_error_engine *ee = &error->engine[i]; 1684 1685 if (ee->hangcheck_timestamp && 1686 time_before(ee->hangcheck_timestamp, epoch)) 1687 epoch = ee->hangcheck_timestamp; 1688 } 1689 1690 return epoch; 1691 } 1692 1693 static void capture_finish(struct i915_gpu_state *error) 1694 { 1695 struct i915_ggtt *ggtt = &error->i915->ggtt; 1696 const u64 slot = ggtt->error_capture.start; 1697 1698 ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE); 1699 } 1700 1701 #define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x)) 1702 1703 struct i915_gpu_state * 1704 i915_capture_gpu_state(struct drm_i915_private *i915) 1705 { 1706 struct i915_gpu_state *error; 1707 struct compress compress; 1708 1709 /* Check if GPU capture has been disabled */ 1710 error = READ_ONCE(i915->gpu_error.first_error); 1711 if (IS_ERR(error)) 1712 return error; 1713 1714 error = kzalloc(sizeof(*error), ALLOW_FAIL); 1715 if (!error) { 1716 i915_disable_error_state(i915, -ENOMEM); 1717 return ERR_PTR(-ENOMEM); 1718 } 1719 1720 if (!compress_init(&compress)) { 1721 kfree(error); 1722 i915_disable_error_state(i915, -ENOMEM); 1723 return ERR_PTR(-ENOMEM); 1724 } 1725 1726 kref_init(&error->ref); 1727 error->i915 = i915; 1728 1729 error->time = ktime_get_real(); 1730 error->boottime = ktime_get_boottime(); 1731 error->uptime = ktime_sub(ktime_get(), i915->gt.last_init_time); 1732 error->capture = jiffies; 1733 1734 capture_params(error); 1735 capture_gen_state(error); 1736 capture_uc_state(error, &compress); 1737 capture_reg_state(error); 1738 gem_record_fences(error); 1739 gem_record_rings(error, &compress); 1740 1741 error->overlay = intel_overlay_capture_error_state(i915); 1742 error->display = intel_display_capture_error_state(i915); 1743 1744 error->epoch = capture_find_epoch(error); 1745 1746 capture_finish(error); 1747 compress_fini(&compress); 1748 1749 return error; 1750 } 1751 1752 /** 1753 * i915_capture_error_state - capture an error record for later analysis 1754 * @i915: i915 device 1755 * @engine_mask: the mask of engines triggering the hang 1756 * @msg: a message to insert into the error capture header 1757 * 1758 * Should be called when an error is 
detected (either a hang or an error 1759 * interrupt) to capture error state from the time of the error. Fills 1760 * out a structure which becomes available in debugfs for user level tools 1761 * to pick up. 1762 */ 1763 void i915_capture_error_state(struct drm_i915_private *i915, 1764 intel_engine_mask_t engine_mask, 1765 const char *msg) 1766 { 1767 static bool warned; 1768 struct i915_gpu_state *error; 1769 unsigned long flags; 1770 1771 if (!i915_modparams.error_capture) 1772 return; 1773 1774 if (READ_ONCE(i915->gpu_error.first_error)) 1775 return; 1776 1777 error = i915_capture_gpu_state(i915); 1778 if (IS_ERR(error)) 1779 return; 1780 1781 dev_info(i915->drm.dev, "%s\n", error_msg(error, engine_mask, msg)); 1782 1783 if (!error->simulated) { 1784 spin_lock_irqsave(&i915->gpu_error.lock, flags); 1785 if (!i915->gpu_error.first_error) { 1786 i915->gpu_error.first_error = error; 1787 error = NULL; 1788 } 1789 spin_unlock_irqrestore(&i915->gpu_error.lock, flags); 1790 } 1791 1792 if (error) { 1793 __i915_gpu_state_free(&error->ref); 1794 return; 1795 } 1796 1797 if (!warned && 1798 ktime_get_real_seconds() - DRIVER_TIMESTAMP < DAY_AS_SECONDS(180)) { 1799 DRM_INFO("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n"); 1800 DRM_INFO("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n"); 1801 DRM_INFO("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n"); 1802 DRM_INFO("The gpu crash dump is required to analyze gpu hangs, so please always attach it.\n"); 1803 DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n", 1804 i915->drm.primary->index); 1805 warned = true; 1806 } 1807 } 1808 1809 struct i915_gpu_state * 1810 i915_first_error_state(struct drm_i915_private *i915) 1811 { 1812 struct i915_gpu_state *error; 1813 1814 spin_lock_irq(&i915->gpu_error.lock); 1815 error = i915->gpu_error.first_error; 1816 if (!IS_ERR_OR_NULL(error)) 1817 i915_gpu_state_get(error); 1818 spin_unlock_irq(&i915->gpu_error.lock); 1819 1820 return error; 1821 } 1822 1823 void i915_reset_error_state(struct drm_i915_private *i915) 1824 { 1825 struct i915_gpu_state *error; 1826 1827 spin_lock_irq(&i915->gpu_error.lock); 1828 error = i915->gpu_error.first_error; 1829 if (error != ERR_PTR(-ENODEV)) /* if disabled, always disabled */ 1830 i915->gpu_error.first_error = NULL; 1831 spin_unlock_irq(&i915->gpu_error.lock); 1832 1833 if (!IS_ERR_OR_NULL(error)) 1834 i915_gpu_state_put(error); 1835 } 1836 1837 void i915_disable_error_state(struct drm_i915_private *i915, int err) 1838 { 1839 spin_lock_irq(&i915->gpu_error.lock); 1840 if (!i915->gpu_error.first_error) 1841 i915->gpu_error.first_error = ERR_PTR(err); 1842 spin_unlock_irq(&i915->gpu_error.lock); 1843 } 1844
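
/*
 * Example usage (illustrative sketch only, not compiled as part of this
 * file): a sysfs/debugfs style reader is expected to pin the first error
 * state, stream it out of the scatterlist buffer with
 * i915_gpu_state_copy_to_buffer() and then drop its reference.  The
 * error_state_read() wrapper below is hypothetical; the i915 functions it
 * calls are the ones defined or declared above.
 *
 *	static ssize_t error_state_read(struct drm_i915_private *i915,
 *					char *buf, loff_t off, size_t count)
 *	{
 *		struct i915_gpu_state *gpu;
 *		ssize_t ret = 0;
 *
 *		gpu = i915_first_error_state(i915);
 *		if (IS_ERR(gpu))
 *			return PTR_ERR(gpu);
 *
 *		if (gpu) {
 *			ret = i915_gpu_state_copy_to_buffer(gpu, buf, off, count);
 *			i915_gpu_state_put(gpu);
 *		}
 *
 *		return ret;
 *	}
 */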