1 /* 2 * Copyright (c) 2008 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * Authors: 24 * Eric Anholt <eric@anholt.net> 25 * Keith Packard <keithp@keithp.com> 26 * Mika Kuoppala <mika.kuoppala@intel.com> 27 * 28 */ 29 30 #include <linux/ascii85.h> 31 #include <linux/nmi.h> 32 #include <linux/scatterlist.h> 33 #include <linux/stop_machine.h> 34 #include <linux/utsname.h> 35 #include <linux/zlib.h> 36 37 #include <drm/drm_print.h> 38 39 #include "i915_gpu_error.h" 40 #include "i915_drv.h" 41 42 static inline const struct intel_engine_cs * 43 engine_lookup(const struct drm_i915_private *i915, unsigned int id) 44 { 45 if (id >= I915_NUM_ENGINES) 46 return NULL; 47 48 return i915->engine[id]; 49 } 50 51 static inline const char * 52 __engine_name(const struct intel_engine_cs *engine) 53 { 54 return engine ? engine->name : ""; 55 } 56 57 static const char * 58 engine_name(const struct drm_i915_private *i915, unsigned int id) 59 { 60 return __engine_name(engine_lookup(i915, id)); 61 } 62 63 static const char *tiling_flag(int tiling) 64 { 65 switch (tiling) { 66 default: 67 case I915_TILING_NONE: return ""; 68 case I915_TILING_X: return " X"; 69 case I915_TILING_Y: return " Y"; 70 } 71 } 72 73 static const char *dirty_flag(int dirty) 74 { 75 return dirty ? " dirty" : ""; 76 } 77 78 static const char *purgeable_flag(int purgeable) 79 { 80 return purgeable ? " purgeable" : ""; 81 } 82 83 static void __sg_set_buf(struct scatterlist *sg, 84 void *addr, unsigned int len, loff_t it) 85 { 86 sg->page_link = (unsigned long)virt_to_page(addr); 87 sg->offset = offset_in_page(addr); 88 sg->length = len; 89 sg->dma_address = it; 90 } 91 92 static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len) 93 { 94 if (!len) 95 return false; 96 97 if (e->bytes + len + 1 <= e->size) 98 return true; 99 100 if (e->bytes) { 101 __sg_set_buf(e->cur++, e->buf, e->bytes, e->iter); 102 e->iter += e->bytes; 103 e->buf = NULL; 104 e->bytes = 0; 105 } 106 107 if (e->cur == e->end) { 108 struct scatterlist *sgl; 109 110 sgl = (typeof(sgl))__get_free_page(GFP_KERNEL); 111 if (!sgl) { 112 e->err = -ENOMEM; 113 return false; 114 } 115 116 if (e->cur) { 117 e->cur->offset = 0; 118 e->cur->length = 0; 119 e->cur->page_link = 120 (unsigned long)sgl | SG_CHAIN; 121 } else { 122 e->sgl = sgl; 123 } 124 125 e->cur = sgl; 126 e->end = sgl + SG_MAX_SINGLE_ALLOC - 1; 127 } 128 129 e->size = ALIGN(len + 1, SZ_64K); 130 e->buf = kmalloc(e->size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); 131 if (!e->buf) { 132 e->size = PAGE_ALIGN(len + 1); 133 e->buf = kmalloc(e->size, GFP_KERNEL); 134 } 135 if (!e->buf) { 136 e->err = -ENOMEM; 137 return false; 138 } 139 140 return true; 141 } 142 143 __printf(2, 0) 144 static void i915_error_vprintf(struct drm_i915_error_state_buf *e, 145 const char *fmt, va_list args) 146 { 147 va_list ap; 148 int len; 149 150 if (e->err) 151 return; 152 153 va_copy(ap, args); 154 len = vsnprintf(NULL, 0, fmt, ap); 155 va_end(ap); 156 if (len <= 0) { 157 e->err = len; 158 return; 159 } 160 161 if (!__i915_error_grow(e, len)) 162 return; 163 164 GEM_BUG_ON(e->bytes >= e->size); 165 len = vscnprintf(e->buf + e->bytes, e->size - e->bytes, fmt, args); 166 if (len < 0) { 167 e->err = len; 168 return; 169 } 170 e->bytes += len; 171 } 172 173 static void i915_error_puts(struct drm_i915_error_state_buf *e, const char *str) 174 { 175 unsigned len; 176 177 if (e->err || !str) 178 return; 179 180 len = strlen(str); 181 if (!__i915_error_grow(e, len)) 182 return; 183 184 GEM_BUG_ON(e->bytes + len > e->size); 185 memcpy(e->buf + e->bytes, str, len); 186 e->bytes += len; 187 } 188 189 #define err_printf(e, ...) i915_error_printf(e, __VA_ARGS__) 190 #define err_puts(e, s) i915_error_puts(e, s) 191 192 static void __i915_printfn_error(struct drm_printer *p, struct va_format *vaf) 193 { 194 i915_error_vprintf(p->arg, vaf->fmt, *vaf->va); 195 } 196 197 static inline struct drm_printer 198 i915_error_printer(struct drm_i915_error_state_buf *e) 199 { 200 struct drm_printer p = { 201 .printfn = __i915_printfn_error, 202 .arg = e, 203 }; 204 return p; 205 } 206 207 #ifdef CONFIG_DRM_I915_COMPRESS_ERROR 208 209 struct compress { 210 struct z_stream_s zstream; 211 void *tmp; 212 }; 213 214 static bool compress_init(struct compress *c) 215 { 216 struct z_stream_s *zstream = memset(&c->zstream, 0, sizeof(c->zstream)); 217 218 zstream->workspace = 219 kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), 220 GFP_ATOMIC | __GFP_NOWARN); 221 if (!zstream->workspace) 222 return false; 223 224 if (zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) != Z_OK) { 225 kfree(zstream->workspace); 226 return false; 227 } 228 229 c->tmp = NULL; 230 if (i915_has_memcpy_from_wc()) 231 c->tmp = (void *)__get_free_page(GFP_ATOMIC | __GFP_NOWARN); 232 233 return true; 234 } 235 236 static void *compress_next_page(struct drm_i915_error_object *dst) 237 { 238 unsigned long page; 239 240 if (dst->page_count >= dst->num_pages) 241 return ERR_PTR(-ENOSPC); 242 243 page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN); 244 if (!page) 245 return ERR_PTR(-ENOMEM); 246 247 return dst->pages[dst->page_count++] = (void *)page; 248 } 249 250 static int compress_page(struct compress *c, 251 void *src, 252 struct drm_i915_error_object *dst) 253 { 254 struct z_stream_s *zstream = &c->zstream; 255 256 zstream->next_in = src; 257 if (c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE)) 258 zstream->next_in = c->tmp; 259 zstream->avail_in = PAGE_SIZE; 260 261 do { 262 if (zstream->avail_out == 0) { 263 zstream->next_out = compress_next_page(dst); 264 if (IS_ERR(zstream->next_out)) 265 return PTR_ERR(zstream->next_out); 266 267 zstream->avail_out = PAGE_SIZE; 268 } 269 270 if (zlib_deflate(zstream, Z_NO_FLUSH) != Z_OK) 271 return -EIO; 272 273 touch_nmi_watchdog(); 274 } while (zstream->avail_in); 275 276 /* Fallback to uncompressed if we increase size? */ 277 if (0 && zstream->total_out > zstream->total_in) 278 return -E2BIG; 279 280 return 0; 281 } 282 283 static int compress_flush(struct compress *c, 284 struct drm_i915_error_object *dst) 285 { 286 struct z_stream_s *zstream = &c->zstream; 287 288 do { 289 switch (zlib_deflate(zstream, Z_FINISH)) { 290 case Z_OK: /* more space requested */ 291 zstream->next_out = compress_next_page(dst); 292 if (IS_ERR(zstream->next_out)) 293 return PTR_ERR(zstream->next_out); 294 295 zstream->avail_out = PAGE_SIZE; 296 break; 297 298 case Z_STREAM_END: 299 goto end; 300 301 default: /* any error */ 302 return -EIO; 303 } 304 } while (1); 305 306 end: 307 memset(zstream->next_out, 0, zstream->avail_out); 308 dst->unused = zstream->avail_out; 309 return 0; 310 } 311 312 static void compress_fini(struct compress *c, 313 struct drm_i915_error_object *dst) 314 { 315 struct z_stream_s *zstream = &c->zstream; 316 317 zlib_deflateEnd(zstream); 318 kfree(zstream->workspace); 319 if (c->tmp) 320 free_page((unsigned long)c->tmp); 321 } 322 323 static void err_compression_marker(struct drm_i915_error_state_buf *m) 324 { 325 err_puts(m, ":"); 326 } 327 328 #else 329 330 struct compress { 331 }; 332 333 static bool compress_init(struct compress *c) 334 { 335 return true; 336 } 337 338 static int compress_page(struct compress *c, 339 void *src, 340 struct drm_i915_error_object *dst) 341 { 342 unsigned long page; 343 void *ptr; 344 345 page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN); 346 if (!page) 347 return -ENOMEM; 348 349 ptr = (void *)page; 350 if (!i915_memcpy_from_wc(ptr, src, PAGE_SIZE)) 351 memcpy(ptr, src, PAGE_SIZE); 352 dst->pages[dst->page_count++] = ptr; 353 354 return 0; 355 } 356 357 static int compress_flush(struct compress *c, 358 struct drm_i915_error_object *dst) 359 { 360 return 0; 361 } 362 363 static void compress_fini(struct compress *c, 364 struct drm_i915_error_object *dst) 365 { 366 } 367 368 static void err_compression_marker(struct drm_i915_error_state_buf *m) 369 { 370 err_puts(m, "~"); 371 } 372 373 #endif 374 375 static void print_error_buffers(struct drm_i915_error_state_buf *m, 376 const char *name, 377 struct drm_i915_error_buffer *err, 378 int count) 379 { 380 err_printf(m, "%s [%d]:\n", name, count); 381 382 while (count--) { 383 err_printf(m, " %08x_%08x %8u %02x %02x", 384 upper_32_bits(err->gtt_offset), 385 lower_32_bits(err->gtt_offset), 386 err->size, 387 err->read_domains, 388 err->write_domain); 389 err_puts(m, tiling_flag(err->tiling)); 390 err_puts(m, dirty_flag(err->dirty)); 391 err_puts(m, purgeable_flag(err->purgeable)); 392 err_puts(m, err->userptr ? " userptr" : ""); 393 err_puts(m, i915_cache_level_str(m->i915, err->cache_level)); 394 395 if (err->name) 396 err_printf(m, " (name: %d)", err->name); 397 if (err->fence_reg != I915_FENCE_REG_NONE) 398 err_printf(m, " (fence: %d)", err->fence_reg); 399 400 err_puts(m, "\n"); 401 err++; 402 } 403 } 404 405 static void error_print_instdone(struct drm_i915_error_state_buf *m, 406 const struct drm_i915_error_engine *ee) 407 { 408 int slice; 409 int subslice; 410 411 err_printf(m, " INSTDONE: 0x%08x\n", 412 ee->instdone.instdone); 413 414 if (ee->engine_id != RCS0 || INTEL_GEN(m->i915) <= 3) 415 return; 416 417 err_printf(m, " SC_INSTDONE: 0x%08x\n", 418 ee->instdone.slice_common); 419 420 if (INTEL_GEN(m->i915) <= 6) 421 return; 422 423 for_each_instdone_slice_subslice(m->i915, slice, subslice) 424 err_printf(m, " SAMPLER_INSTDONE[%d][%d]: 0x%08x\n", 425 slice, subslice, 426 ee->instdone.sampler[slice][subslice]); 427 428 for_each_instdone_slice_subslice(m->i915, slice, subslice) 429 err_printf(m, " ROW_INSTDONE[%d][%d]: 0x%08x\n", 430 slice, subslice, 431 ee->instdone.row[slice][subslice]); 432 } 433 434 static void error_print_request(struct drm_i915_error_state_buf *m, 435 const char *prefix, 436 const struct drm_i915_error_request *erq, 437 const unsigned long epoch) 438 { 439 if (!erq->seqno) 440 return; 441 442 err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n", 443 prefix, erq->pid, erq->context, erq->seqno, 444 test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, 445 &erq->flags) ? "!" : "", 446 test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, 447 &erq->flags) ? "+" : "", 448 erq->sched_attr.priority, 449 jiffies_to_msecs(erq->jiffies - epoch), 450 erq->start, erq->head, erq->tail); 451 } 452 453 static void error_print_context(struct drm_i915_error_state_buf *m, 454 const char *header, 455 const struct drm_i915_error_context *ctx) 456 { 457 err_printf(m, "%s%s[%d] hw_id %d, prio %d, guilty %d active %d\n", 458 header, ctx->comm, ctx->pid, ctx->hw_id, 459 ctx->sched_attr.priority, ctx->guilty, ctx->active); 460 } 461 462 static void error_print_engine(struct drm_i915_error_state_buf *m, 463 const struct drm_i915_error_engine *ee, 464 const unsigned long epoch) 465 { 466 int n; 467 468 err_printf(m, "%s command stream:\n", 469 engine_name(m->i915, ee->engine_id)); 470 err_printf(m, " IDLE?: %s\n", yesno(ee->idle)); 471 err_printf(m, " START: 0x%08x\n", ee->start); 472 err_printf(m, " HEAD: 0x%08x [0x%08x]\n", ee->head, ee->rq_head); 473 err_printf(m, " TAIL: 0x%08x [0x%08x, 0x%08x]\n", 474 ee->tail, ee->rq_post, ee->rq_tail); 475 err_printf(m, " CTL: 0x%08x\n", ee->ctl); 476 err_printf(m, " MODE: 0x%08x\n", ee->mode); 477 err_printf(m, " HWS: 0x%08x\n", ee->hws); 478 err_printf(m, " ACTHD: 0x%08x %08x\n", 479 (u32)(ee->acthd>>32), (u32)ee->acthd); 480 err_printf(m, " IPEIR: 0x%08x\n", ee->ipeir); 481 err_printf(m, " IPEHR: 0x%08x\n", ee->ipehr); 482 483 error_print_instdone(m, ee); 484 485 if (ee->batchbuffer) { 486 u64 start = ee->batchbuffer->gtt_offset; 487 u64 end = start + ee->batchbuffer->gtt_size; 488 489 err_printf(m, " batch: [0x%08x_%08x, 0x%08x_%08x]\n", 490 upper_32_bits(start), lower_32_bits(start), 491 upper_32_bits(end), lower_32_bits(end)); 492 } 493 if (INTEL_GEN(m->i915) >= 4) { 494 err_printf(m, " BBADDR: 0x%08x_%08x\n", 495 (u32)(ee->bbaddr>>32), (u32)ee->bbaddr); 496 err_printf(m, " BB_STATE: 0x%08x\n", ee->bbstate); 497 err_printf(m, " INSTPS: 0x%08x\n", ee->instps); 498 } 499 err_printf(m, " INSTPM: 0x%08x\n", ee->instpm); 500 err_printf(m, " FADDR: 0x%08x %08x\n", upper_32_bits(ee->faddr), 501 lower_32_bits(ee->faddr)); 502 if (INTEL_GEN(m->i915) >= 6) { 503 err_printf(m, " RC PSMI: 0x%08x\n", ee->rc_psmi); 504 err_printf(m, " FAULT_REG: 0x%08x\n", ee->fault_reg); 505 } 506 if (HAS_PPGTT(m->i915)) { 507 err_printf(m, " GFX_MODE: 0x%08x\n", ee->vm_info.gfx_mode); 508 509 if (INTEL_GEN(m->i915) >= 8) { 510 int i; 511 for (i = 0; i < 4; i++) 512 err_printf(m, " PDP%d: 0x%016llx\n", 513 i, ee->vm_info.pdp[i]); 514 } else { 515 err_printf(m, " PP_DIR_BASE: 0x%08x\n", 516 ee->vm_info.pp_dir_base); 517 } 518 } 519 err_printf(m, " ring->head: 0x%08x\n", ee->cpu_ring_head); 520 err_printf(m, " ring->tail: 0x%08x\n", ee->cpu_ring_tail); 521 err_printf(m, " hangcheck timestamp: %dms (%lu%s)\n", 522 jiffies_to_msecs(ee->hangcheck_timestamp - epoch), 523 ee->hangcheck_timestamp, 524 ee->hangcheck_timestamp == epoch ? "; epoch" : ""); 525 err_printf(m, " engine reset count: %u\n", ee->reset_count); 526 527 for (n = 0; n < ee->num_ports; n++) { 528 err_printf(m, " ELSP[%d]:", n); 529 error_print_request(m, " ", &ee->execlist[n], epoch); 530 } 531 532 error_print_context(m, " Active context: ", &ee->context); 533 } 534 535 void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...) 536 { 537 va_list args; 538 539 va_start(args, f); 540 i915_error_vprintf(e, f, args); 541 va_end(args); 542 } 543 544 static void print_error_obj(struct drm_i915_error_state_buf *m, 545 struct intel_engine_cs *engine, 546 const char *name, 547 struct drm_i915_error_object *obj) 548 { 549 char out[ASCII85_BUFSZ]; 550 int page; 551 552 if (!obj) 553 return; 554 555 if (name) { 556 err_printf(m, "%s --- %s = 0x%08x %08x\n", 557 engine ? engine->name : "global", name, 558 upper_32_bits(obj->gtt_offset), 559 lower_32_bits(obj->gtt_offset)); 560 } 561 562 err_compression_marker(m); 563 for (page = 0; page < obj->page_count; page++) { 564 int i, len; 565 566 len = PAGE_SIZE; 567 if (page == obj->page_count - 1) 568 len -= obj->unused; 569 len = ascii85_encode_len(len); 570 571 for (i = 0; i < len; i++) 572 err_puts(m, ascii85_encode(obj->pages[page][i], out)); 573 } 574 err_puts(m, "\n"); 575 } 576 577 static void err_print_capabilities(struct drm_i915_error_state_buf *m, 578 const struct intel_device_info *info, 579 const struct intel_runtime_info *runtime, 580 const struct intel_driver_caps *caps) 581 { 582 struct drm_printer p = i915_error_printer(m); 583 584 intel_device_info_dump_flags(info, &p); 585 intel_driver_caps_print(caps, &p); 586 intel_device_info_dump_topology(&runtime->sseu, &p); 587 } 588 589 static void err_print_params(struct drm_i915_error_state_buf *m, 590 const struct i915_params *params) 591 { 592 struct drm_printer p = i915_error_printer(m); 593 594 i915_params_dump(params, &p); 595 } 596 597 static void err_print_pciid(struct drm_i915_error_state_buf *m, 598 struct drm_i915_private *i915) 599 { 600 struct pci_dev *pdev = i915->drm.pdev; 601 602 err_printf(m, "PCI ID: 0x%04x\n", pdev->device); 603 err_printf(m, "PCI Revision: 0x%02x\n", pdev->revision); 604 err_printf(m, "PCI Subsystem: %04x:%04x\n", 605 pdev->subsystem_vendor, 606 pdev->subsystem_device); 607 } 608 609 static void err_print_uc(struct drm_i915_error_state_buf *m, 610 const struct i915_error_uc *error_uc) 611 { 612 struct drm_printer p = i915_error_printer(m); 613 const struct i915_gpu_state *error = 614 container_of(error_uc, typeof(*error), uc); 615 616 if (!error->device_info.has_guc) 617 return; 618 619 intel_uc_fw_dump(&error_uc->guc_fw, &p); 620 intel_uc_fw_dump(&error_uc->huc_fw, &p); 621 print_error_obj(m, NULL, "GuC log buffer", error_uc->guc_log); 622 } 623 624 static void err_free_sgl(struct scatterlist *sgl) 625 { 626 while (sgl) { 627 struct scatterlist *sg; 628 629 for (sg = sgl; !sg_is_chain(sg); sg++) { 630 kfree(sg_virt(sg)); 631 if (sg_is_last(sg)) 632 break; 633 } 634 635 sg = sg_is_last(sg) ? NULL : sg_chain_ptr(sg); 636 free_page((unsigned long)sgl); 637 sgl = sg; 638 } 639 } 640 641 static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, 642 struct i915_gpu_state *error) 643 { 644 struct drm_i915_error_object *obj; 645 struct timespec64 ts; 646 int i, j; 647 648 if (*error->error_msg) 649 err_printf(m, "%s\n", error->error_msg); 650 err_printf(m, "Kernel: %s %s\n", 651 init_utsname()->release, 652 init_utsname()->machine); 653 ts = ktime_to_timespec64(error->time); 654 err_printf(m, "Time: %lld s %ld us\n", 655 (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC); 656 ts = ktime_to_timespec64(error->boottime); 657 err_printf(m, "Boottime: %lld s %ld us\n", 658 (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC); 659 ts = ktime_to_timespec64(error->uptime); 660 err_printf(m, "Uptime: %lld s %ld us\n", 661 (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC); 662 err_printf(m, "Epoch: %lu jiffies (%u HZ)\n", error->epoch, HZ); 663 err_printf(m, "Capture: %lu jiffies; %d ms ago, %d ms after epoch\n", 664 error->capture, 665 jiffies_to_msecs(jiffies - error->capture), 666 jiffies_to_msecs(error->capture - error->epoch)); 667 668 for (i = 0; i < ARRAY_SIZE(error->engine); i++) { 669 if (!error->engine[i].context.pid) 670 continue; 671 672 err_printf(m, "Active process (on ring %s): %s [%d]\n", 673 engine_name(m->i915, i), 674 error->engine[i].context.comm, 675 error->engine[i].context.pid); 676 } 677 err_printf(m, "Reset count: %u\n", error->reset_count); 678 err_printf(m, "Suspend count: %u\n", error->suspend_count); 679 err_printf(m, "Platform: %s\n", intel_platform_name(error->device_info.platform)); 680 err_printf(m, "Subplatform: 0x%x\n", 681 intel_subplatform(&error->runtime_info, 682 error->device_info.platform)); 683 err_print_pciid(m, m->i915); 684 685 err_printf(m, "IOMMU enabled?: %d\n", error->iommu); 686 687 if (HAS_CSR(m->i915)) { 688 struct intel_csr *csr = &m->i915->csr; 689 690 err_printf(m, "DMC loaded: %s\n", 691 yesno(csr->dmc_payload != NULL)); 692 err_printf(m, "DMC fw version: %d.%d\n", 693 CSR_VERSION_MAJOR(csr->version), 694 CSR_VERSION_MINOR(csr->version)); 695 } 696 697 err_printf(m, "GT awake: %s\n", yesno(error->awake)); 698 err_printf(m, "RPM wakelock: %s\n", yesno(error->wakelock)); 699 err_printf(m, "PM suspended: %s\n", yesno(error->suspended)); 700 err_printf(m, "EIR: 0x%08x\n", error->eir); 701 err_printf(m, "IER: 0x%08x\n", error->ier); 702 for (i = 0; i < error->ngtier; i++) 703 err_printf(m, "GTIER[%d]: 0x%08x\n", i, error->gtier[i]); 704 err_printf(m, "PGTBL_ER: 0x%08x\n", error->pgtbl_er); 705 err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake); 706 err_printf(m, "DERRMR: 0x%08x\n", error->derrmr); 707 err_printf(m, "CCID: 0x%08x\n", error->ccid); 708 709 for (i = 0; i < error->nfence; i++) 710 err_printf(m, " fence[%d] = %08llx\n", i, error->fence[i]); 711 712 if (INTEL_GEN(m->i915) >= 6) { 713 err_printf(m, "ERROR: 0x%08x\n", error->error); 714 715 if (INTEL_GEN(m->i915) >= 8) 716 err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n", 717 error->fault_data1, error->fault_data0); 718 719 err_printf(m, "DONE_REG: 0x%08x\n", error->done_reg); 720 } 721 722 if (IS_GEN(m->i915, 7)) 723 err_printf(m, "ERR_INT: 0x%08x\n", error->err_int); 724 725 for (i = 0; i < ARRAY_SIZE(error->engine); i++) { 726 if (error->engine[i].engine_id != -1) 727 error_print_engine(m, &error->engine[i], error->epoch); 728 } 729 730 for (i = 0; i < ARRAY_SIZE(error->active_vm); i++) { 731 char buf[128]; 732 int len, first = 1; 733 734 if (!error->active_vm[i]) 735 break; 736 737 len = scnprintf(buf, sizeof(buf), "Active ("); 738 for (j = 0; j < ARRAY_SIZE(error->engine); j++) { 739 if (error->engine[j].vm != error->active_vm[i]) 740 continue; 741 742 len += scnprintf(buf + len, sizeof(buf), "%s%s", 743 first ? "" : ", ", 744 m->i915->engine[j]->name); 745 first = 0; 746 } 747 scnprintf(buf + len, sizeof(buf), ")"); 748 print_error_buffers(m, buf, 749 error->active_bo[i], 750 error->active_bo_count[i]); 751 } 752 753 print_error_buffers(m, "Pinned (global)", 754 error->pinned_bo, 755 error->pinned_bo_count); 756 757 for (i = 0; i < ARRAY_SIZE(error->engine); i++) { 758 const struct drm_i915_error_engine *ee = &error->engine[i]; 759 760 obj = ee->batchbuffer; 761 if (obj) { 762 err_puts(m, m->i915->engine[i]->name); 763 if (ee->context.pid) 764 err_printf(m, " (submitted by %s [%d])", 765 ee->context.comm, 766 ee->context.pid); 767 err_printf(m, " --- gtt_offset = 0x%08x %08x\n", 768 upper_32_bits(obj->gtt_offset), 769 lower_32_bits(obj->gtt_offset)); 770 print_error_obj(m, m->i915->engine[i], NULL, obj); 771 } 772 773 for (j = 0; j < ee->user_bo_count; j++) 774 print_error_obj(m, m->i915->engine[i], 775 "user", ee->user_bo[j]); 776 777 if (ee->num_requests) { 778 err_printf(m, "%s --- %d requests\n", 779 m->i915->engine[i]->name, 780 ee->num_requests); 781 for (j = 0; j < ee->num_requests; j++) 782 error_print_request(m, " ", 783 &ee->requests[j], 784 error->epoch); 785 } 786 787 print_error_obj(m, m->i915->engine[i], 788 "ringbuffer", ee->ringbuffer); 789 790 print_error_obj(m, m->i915->engine[i], 791 "HW Status", ee->hws_page); 792 793 print_error_obj(m, m->i915->engine[i], 794 "HW context", ee->ctx); 795 796 print_error_obj(m, m->i915->engine[i], 797 "WA context", ee->wa_ctx); 798 799 print_error_obj(m, m->i915->engine[i], 800 "WA batchbuffer", ee->wa_batchbuffer); 801 802 print_error_obj(m, m->i915->engine[i], 803 "NULL context", ee->default_state); 804 } 805 806 if (error->overlay) 807 intel_overlay_print_error_state(m, error->overlay); 808 809 if (error->display) 810 intel_display_print_error_state(m, error->display); 811 812 err_print_capabilities(m, &error->device_info, &error->runtime_info, 813 &error->driver_caps); 814 err_print_params(m, &error->params); 815 err_print_uc(m, &error->uc); 816 } 817 818 static int err_print_to_sgl(struct i915_gpu_state *error) 819 { 820 struct drm_i915_error_state_buf m; 821 822 if (IS_ERR(error)) 823 return PTR_ERR(error); 824 825 if (READ_ONCE(error->sgl)) 826 return 0; 827 828 memset(&m, 0, sizeof(m)); 829 m.i915 = error->i915; 830 831 __err_print_to_sgl(&m, error); 832 833 if (m.buf) { 834 __sg_set_buf(m.cur++, m.buf, m.bytes, m.iter); 835 m.bytes = 0; 836 m.buf = NULL; 837 } 838 if (m.cur) { 839 GEM_BUG_ON(m.end < m.cur); 840 sg_mark_end(m.cur - 1); 841 } 842 GEM_BUG_ON(m.sgl && !m.cur); 843 844 if (m.err) { 845 err_free_sgl(m.sgl); 846 return m.err; 847 } 848 849 if (cmpxchg(&error->sgl, NULL, m.sgl)) 850 err_free_sgl(m.sgl); 851 852 return 0; 853 } 854 855 ssize_t i915_gpu_state_copy_to_buffer(struct i915_gpu_state *error, 856 char *buf, loff_t off, size_t rem) 857 { 858 struct scatterlist *sg; 859 size_t count; 860 loff_t pos; 861 int err; 862 863 if (!error || !rem) 864 return 0; 865 866 err = err_print_to_sgl(error); 867 if (err) 868 return err; 869 870 sg = READ_ONCE(error->fit); 871 if (!sg || off < sg->dma_address) 872 sg = error->sgl; 873 if (!sg) 874 return 0; 875 876 pos = sg->dma_address; 877 count = 0; 878 do { 879 size_t len, start; 880 881 if (sg_is_chain(sg)) { 882 sg = sg_chain_ptr(sg); 883 GEM_BUG_ON(sg_is_chain(sg)); 884 } 885 886 len = sg->length; 887 if (pos + len <= off) { 888 pos += len; 889 continue; 890 } 891 892 start = sg->offset; 893 if (pos < off) { 894 GEM_BUG_ON(off - pos > len); 895 len -= off - pos; 896 start += off - pos; 897 pos = off; 898 } 899 900 len = min(len, rem); 901 GEM_BUG_ON(!len || len > sg->length); 902 903 memcpy(buf, page_address(sg_page(sg)) + start, len); 904 905 count += len; 906 pos += len; 907 908 buf += len; 909 rem -= len; 910 if (!rem) { 911 WRITE_ONCE(error->fit, sg); 912 break; 913 } 914 } while (!sg_is_last(sg++)); 915 916 return count; 917 } 918 919 static void i915_error_object_free(struct drm_i915_error_object *obj) 920 { 921 int page; 922 923 if (obj == NULL) 924 return; 925 926 for (page = 0; page < obj->page_count; page++) 927 free_page((unsigned long)obj->pages[page]); 928 929 kfree(obj); 930 } 931 932 933 static void cleanup_params(struct i915_gpu_state *error) 934 { 935 i915_params_free(&error->params); 936 } 937 938 static void cleanup_uc_state(struct i915_gpu_state *error) 939 { 940 struct i915_error_uc *error_uc = &error->uc; 941 942 kfree(error_uc->guc_fw.path); 943 kfree(error_uc->huc_fw.path); 944 i915_error_object_free(error_uc->guc_log); 945 } 946 947 void __i915_gpu_state_free(struct kref *error_ref) 948 { 949 struct i915_gpu_state *error = 950 container_of(error_ref, typeof(*error), ref); 951 long i, j; 952 953 for (i = 0; i < ARRAY_SIZE(error->engine); i++) { 954 struct drm_i915_error_engine *ee = &error->engine[i]; 955 956 for (j = 0; j < ee->user_bo_count; j++) 957 i915_error_object_free(ee->user_bo[j]); 958 kfree(ee->user_bo); 959 960 i915_error_object_free(ee->batchbuffer); 961 i915_error_object_free(ee->wa_batchbuffer); 962 i915_error_object_free(ee->ringbuffer); 963 i915_error_object_free(ee->hws_page); 964 i915_error_object_free(ee->ctx); 965 i915_error_object_free(ee->wa_ctx); 966 967 kfree(ee->requests); 968 } 969 970 for (i = 0; i < ARRAY_SIZE(error->active_bo); i++) 971 kfree(error->active_bo[i]); 972 kfree(error->pinned_bo); 973 974 kfree(error->overlay); 975 kfree(error->display); 976 977 cleanup_params(error); 978 cleanup_uc_state(error); 979 980 err_free_sgl(error->sgl); 981 kfree(error); 982 } 983 984 static struct drm_i915_error_object * 985 i915_error_object_create(struct drm_i915_private *i915, 986 struct i915_vma *vma) 987 { 988 struct i915_ggtt *ggtt = &i915->ggtt; 989 const u64 slot = ggtt->error_capture.start; 990 struct drm_i915_error_object *dst; 991 struct compress compress; 992 unsigned long num_pages; 993 struct sgt_iter iter; 994 dma_addr_t dma; 995 int ret; 996 997 if (!vma || !vma->pages) 998 return NULL; 999 1000 num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT; 1001 num_pages = DIV_ROUND_UP(10 * num_pages, 8); /* worstcase zlib growth */ 1002 dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *), 1003 GFP_ATOMIC | __GFP_NOWARN); 1004 if (!dst) 1005 return NULL; 1006 1007 dst->gtt_offset = vma->node.start; 1008 dst->gtt_size = vma->node.size; 1009 dst->num_pages = num_pages; 1010 dst->page_count = 0; 1011 dst->unused = 0; 1012 1013 if (!compress_init(&compress)) { 1014 kfree(dst); 1015 return NULL; 1016 } 1017 1018 ret = -EINVAL; 1019 for_each_sgt_dma(dma, iter, vma->pages) { 1020 void __iomem *s; 1021 1022 ggtt->vm.insert_page(&ggtt->vm, dma, slot, I915_CACHE_NONE, 0); 1023 1024 s = io_mapping_map_atomic_wc(&ggtt->iomap, slot); 1025 ret = compress_page(&compress, (void __force *)s, dst); 1026 io_mapping_unmap_atomic(s); 1027 if (ret) 1028 break; 1029 } 1030 1031 if (ret || compress_flush(&compress, dst)) { 1032 while (dst->page_count--) 1033 free_page((unsigned long)dst->pages[dst->page_count]); 1034 kfree(dst); 1035 dst = NULL; 1036 } 1037 1038 compress_fini(&compress, dst); 1039 return dst; 1040 } 1041 1042 static void capture_bo(struct drm_i915_error_buffer *err, 1043 struct i915_vma *vma) 1044 { 1045 struct drm_i915_gem_object *obj = vma->obj; 1046 1047 err->size = obj->base.size; 1048 err->name = obj->base.name; 1049 1050 err->gtt_offset = vma->node.start; 1051 err->read_domains = obj->read_domains; 1052 err->write_domain = obj->write_domain; 1053 err->fence_reg = vma->fence ? vma->fence->id : -1; 1054 err->tiling = i915_gem_object_get_tiling(obj); 1055 err->dirty = obj->mm.dirty; 1056 err->purgeable = obj->mm.madv != I915_MADV_WILLNEED; 1057 err->userptr = obj->userptr.mm != NULL; 1058 err->cache_level = obj->cache_level; 1059 } 1060 1061 static u32 capture_error_bo(struct drm_i915_error_buffer *err, 1062 int count, struct list_head *head, 1063 unsigned int flags) 1064 #define ACTIVE_ONLY BIT(0) 1065 #define PINNED_ONLY BIT(1) 1066 { 1067 struct i915_vma *vma; 1068 int i = 0; 1069 1070 list_for_each_entry(vma, head, vm_link) { 1071 if (!vma->obj) 1072 continue; 1073 1074 if (flags & ACTIVE_ONLY && !i915_vma_is_active(vma)) 1075 continue; 1076 1077 if (flags & PINNED_ONLY && !i915_vma_is_pinned(vma)) 1078 continue; 1079 1080 capture_bo(err++, vma); 1081 if (++i == count) 1082 break; 1083 } 1084 1085 return i; 1086 } 1087 1088 /* 1089 * Generate a semi-unique error code. The code is not meant to have meaning, The 1090 * code's only purpose is to try to prevent false duplicated bug reports by 1091 * grossly estimating a GPU error state. 1092 * 1093 * TODO Ideally, hashing the batchbuffer would be a very nice way to determine 1094 * the hang if we could strip the GTT offset information from it. 1095 * 1096 * It's only a small step better than a random number in its current form. 1097 */ 1098 static u32 i915_error_generate_code(struct i915_gpu_state *error, 1099 intel_engine_mask_t engine_mask) 1100 { 1101 /* 1102 * IPEHR would be an ideal way to detect errors, as it's the gross 1103 * measure of "the command that hung." However, has some very common 1104 * synchronization commands which almost always appear in the case 1105 * strictly a client bug. Use instdone to differentiate those some. 1106 */ 1107 if (engine_mask) { 1108 struct drm_i915_error_engine *ee = 1109 &error->engine[ffs(engine_mask)]; 1110 1111 return ee->ipehr ^ ee->instdone.instdone; 1112 } 1113 1114 return 0; 1115 } 1116 1117 static void gem_record_fences(struct i915_gpu_state *error) 1118 { 1119 struct drm_i915_private *dev_priv = error->i915; 1120 int i; 1121 1122 if (INTEL_GEN(dev_priv) >= 6) { 1123 for (i = 0; i < dev_priv->num_fence_regs; i++) 1124 error->fence[i] = I915_READ64(FENCE_REG_GEN6_LO(i)); 1125 } else if (INTEL_GEN(dev_priv) >= 4) { 1126 for (i = 0; i < dev_priv->num_fence_regs; i++) 1127 error->fence[i] = I915_READ64(FENCE_REG_965_LO(i)); 1128 } else { 1129 for (i = 0; i < dev_priv->num_fence_regs; i++) 1130 error->fence[i] = I915_READ(FENCE_REG(i)); 1131 } 1132 error->nfence = i; 1133 } 1134 1135 static void error_record_engine_registers(struct i915_gpu_state *error, 1136 struct intel_engine_cs *engine, 1137 struct drm_i915_error_engine *ee) 1138 { 1139 struct drm_i915_private *dev_priv = engine->i915; 1140 1141 if (INTEL_GEN(dev_priv) >= 6) { 1142 ee->rc_psmi = ENGINE_READ(engine, RING_PSMI_CTL); 1143 if (INTEL_GEN(dev_priv) >= 8) 1144 ee->fault_reg = I915_READ(GEN8_RING_FAULT_REG); 1145 else 1146 ee->fault_reg = I915_READ(RING_FAULT_REG(engine)); 1147 } 1148 1149 if (INTEL_GEN(dev_priv) >= 4) { 1150 ee->faddr = ENGINE_READ(engine, RING_DMA_FADD); 1151 ee->ipeir = ENGINE_READ(engine, RING_IPEIR); 1152 ee->ipehr = ENGINE_READ(engine, RING_IPEHR); 1153 ee->instps = ENGINE_READ(engine, RING_INSTPS); 1154 ee->bbaddr = ENGINE_READ(engine, RING_BBADDR); 1155 if (INTEL_GEN(dev_priv) >= 8) { 1156 ee->faddr |= (u64)ENGINE_READ(engine, RING_DMA_FADD_UDW) << 32; 1157 ee->bbaddr |= (u64)ENGINE_READ(engine, RING_BBADDR_UDW) << 32; 1158 } 1159 ee->bbstate = ENGINE_READ(engine, RING_BBSTATE); 1160 } else { 1161 ee->faddr = ENGINE_READ(engine, DMA_FADD_I8XX); 1162 ee->ipeir = ENGINE_READ(engine, IPEIR); 1163 ee->ipehr = ENGINE_READ(engine, IPEHR); 1164 } 1165 1166 intel_engine_get_instdone(engine, &ee->instdone); 1167 1168 ee->instpm = ENGINE_READ(engine, RING_INSTPM); 1169 ee->acthd = intel_engine_get_active_head(engine); 1170 ee->start = ENGINE_READ(engine, RING_START); 1171 ee->head = ENGINE_READ(engine, RING_HEAD); 1172 ee->tail = ENGINE_READ(engine, RING_TAIL); 1173 ee->ctl = ENGINE_READ(engine, RING_CTL); 1174 if (INTEL_GEN(dev_priv) > 2) 1175 ee->mode = ENGINE_READ(engine, RING_MI_MODE); 1176 1177 if (!HWS_NEEDS_PHYSICAL(dev_priv)) { 1178 i915_reg_t mmio; 1179 1180 if (IS_GEN(dev_priv, 7)) { 1181 switch (engine->id) { 1182 default: 1183 MISSING_CASE(engine->id); 1184 case RCS0: 1185 mmio = RENDER_HWS_PGA_GEN7; 1186 break; 1187 case BCS0: 1188 mmio = BLT_HWS_PGA_GEN7; 1189 break; 1190 case VCS0: 1191 mmio = BSD_HWS_PGA_GEN7; 1192 break; 1193 case VECS0: 1194 mmio = VEBOX_HWS_PGA_GEN7; 1195 break; 1196 } 1197 } else if (IS_GEN(engine->i915, 6)) { 1198 mmio = RING_HWS_PGA_GEN6(engine->mmio_base); 1199 } else { 1200 /* XXX: gen8 returns to sanity */ 1201 mmio = RING_HWS_PGA(engine->mmio_base); 1202 } 1203 1204 ee->hws = I915_READ(mmio); 1205 } 1206 1207 ee->idle = intel_engine_is_idle(engine); 1208 if (!ee->idle) 1209 ee->hangcheck_timestamp = engine->hangcheck.action_timestamp; 1210 ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error, 1211 engine); 1212 1213 if (HAS_PPGTT(dev_priv)) { 1214 int i; 1215 1216 ee->vm_info.gfx_mode = I915_READ(RING_MODE_GEN7(engine)); 1217 1218 if (IS_GEN(dev_priv, 6)) { 1219 ee->vm_info.pp_dir_base = 1220 ENGINE_READ(engine, RING_PP_DIR_BASE_READ); 1221 } else if (IS_GEN(dev_priv, 7)) { 1222 ee->vm_info.pp_dir_base = 1223 ENGINE_READ(engine, RING_PP_DIR_BASE); 1224 } else if (INTEL_GEN(dev_priv) >= 8) { 1225 u32 base = engine->mmio_base; 1226 1227 for (i = 0; i < 4; i++) { 1228 ee->vm_info.pdp[i] = 1229 I915_READ(GEN8_RING_PDP_UDW(base, i)); 1230 ee->vm_info.pdp[i] <<= 32; 1231 ee->vm_info.pdp[i] |= 1232 I915_READ(GEN8_RING_PDP_LDW(base, i)); 1233 } 1234 } 1235 } 1236 } 1237 1238 static void record_request(struct i915_request *request, 1239 struct drm_i915_error_request *erq) 1240 { 1241 struct i915_gem_context *ctx = request->gem_context; 1242 1243 erq->flags = request->fence.flags; 1244 erq->context = request->fence.context; 1245 erq->seqno = request->fence.seqno; 1246 erq->sched_attr = request->sched.attr; 1247 erq->jiffies = request->emitted_jiffies; 1248 erq->start = i915_ggtt_offset(request->ring->vma); 1249 erq->head = request->head; 1250 erq->tail = request->tail; 1251 1252 rcu_read_lock(); 1253 erq->pid = ctx->pid ? pid_nr(ctx->pid) : 0; 1254 rcu_read_unlock(); 1255 } 1256 1257 static void engine_record_requests(struct intel_engine_cs *engine, 1258 struct i915_request *first, 1259 struct drm_i915_error_engine *ee) 1260 { 1261 struct i915_request *request; 1262 int count; 1263 1264 count = 0; 1265 request = first; 1266 list_for_each_entry_from(request, &engine->timeline.requests, link) 1267 count++; 1268 if (!count) 1269 return; 1270 1271 ee->requests = kcalloc(count, sizeof(*ee->requests), GFP_ATOMIC); 1272 if (!ee->requests) 1273 return; 1274 1275 ee->num_requests = count; 1276 1277 count = 0; 1278 request = first; 1279 list_for_each_entry_from(request, &engine->timeline.requests, link) { 1280 if (count >= ee->num_requests) { 1281 /* 1282 * If the ring request list was changed in 1283 * between the point where the error request 1284 * list was created and dimensioned and this 1285 * point then just exit early to avoid crashes. 1286 * 1287 * We don't need to communicate that the 1288 * request list changed state during error 1289 * state capture and that the error state is 1290 * slightly incorrect as a consequence since we 1291 * are typically only interested in the request 1292 * list state at the point of error state 1293 * capture, not in any changes happening during 1294 * the capture. 1295 */ 1296 break; 1297 } 1298 1299 record_request(request, &ee->requests[count++]); 1300 } 1301 ee->num_requests = count; 1302 } 1303 1304 static void error_record_engine_execlists(struct intel_engine_cs *engine, 1305 struct drm_i915_error_engine *ee) 1306 { 1307 const struct intel_engine_execlists * const execlists = &engine->execlists; 1308 unsigned int n; 1309 1310 for (n = 0; n < execlists_num_ports(execlists); n++) { 1311 struct i915_request *rq = port_request(&execlists->port[n]); 1312 1313 if (!rq) 1314 break; 1315 1316 record_request(rq, &ee->execlist[n]); 1317 } 1318 1319 ee->num_ports = n; 1320 } 1321 1322 static void record_context(struct drm_i915_error_context *e, 1323 struct i915_gem_context *ctx) 1324 { 1325 if (ctx->pid) { 1326 struct task_struct *task; 1327 1328 rcu_read_lock(); 1329 task = pid_task(ctx->pid, PIDTYPE_PID); 1330 if (task) { 1331 strcpy(e->comm, task->comm); 1332 e->pid = task->pid; 1333 } 1334 rcu_read_unlock(); 1335 } 1336 1337 e->hw_id = ctx->hw_id; 1338 e->sched_attr = ctx->sched; 1339 e->guilty = atomic_read(&ctx->guilty_count); 1340 e->active = atomic_read(&ctx->active_count); 1341 } 1342 1343 static void request_record_user_bo(struct i915_request *request, 1344 struct drm_i915_error_engine *ee) 1345 { 1346 struct i915_capture_list *c; 1347 struct drm_i915_error_object **bo; 1348 long count, max; 1349 1350 max = 0; 1351 for (c = request->capture_list; c; c = c->next) 1352 max++; 1353 if (!max) 1354 return; 1355 1356 bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC); 1357 if (!bo) { 1358 /* If we can't capture everything, try to capture something. */ 1359 max = min_t(long, max, PAGE_SIZE / sizeof(*bo)); 1360 bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC); 1361 } 1362 if (!bo) 1363 return; 1364 1365 count = 0; 1366 for (c = request->capture_list; c; c = c->next) { 1367 bo[count] = i915_error_object_create(request->i915, c->vma); 1368 if (!bo[count]) 1369 break; 1370 if (++count == max) 1371 break; 1372 } 1373 1374 ee->user_bo = bo; 1375 ee->user_bo_count = count; 1376 } 1377 1378 static struct drm_i915_error_object * 1379 capture_object(struct drm_i915_private *dev_priv, 1380 struct drm_i915_gem_object *obj) 1381 { 1382 if (obj && i915_gem_object_has_pages(obj)) { 1383 struct i915_vma fake = { 1384 .node = { .start = U64_MAX, .size = obj->base.size }, 1385 .size = obj->base.size, 1386 .pages = obj->mm.pages, 1387 .obj = obj, 1388 }; 1389 1390 return i915_error_object_create(dev_priv, &fake); 1391 } else { 1392 return NULL; 1393 } 1394 } 1395 1396 static void gem_record_rings(struct i915_gpu_state *error) 1397 { 1398 struct drm_i915_private *i915 = error->i915; 1399 struct i915_ggtt *ggtt = &i915->ggtt; 1400 int i; 1401 1402 for (i = 0; i < I915_NUM_ENGINES; i++) { 1403 struct intel_engine_cs *engine = i915->engine[i]; 1404 struct drm_i915_error_engine *ee = &error->engine[i]; 1405 struct i915_request *request; 1406 1407 ee->engine_id = -1; 1408 1409 if (!engine) 1410 continue; 1411 1412 ee->engine_id = i; 1413 1414 error_record_engine_registers(error, engine, ee); 1415 error_record_engine_execlists(engine, ee); 1416 1417 request = intel_engine_find_active_request(engine); 1418 if (request) { 1419 struct i915_gem_context *ctx = request->gem_context; 1420 struct intel_ring *ring; 1421 1422 ee->vm = ctx->ppgtt ? &ctx->ppgtt->vm : &ggtt->vm; 1423 1424 record_context(&ee->context, ctx); 1425 1426 /* We need to copy these to an anonymous buffer 1427 * as the simplest method to avoid being overwritten 1428 * by userspace. 1429 */ 1430 ee->batchbuffer = 1431 i915_error_object_create(i915, request->batch); 1432 1433 if (HAS_BROKEN_CS_TLB(i915)) 1434 ee->wa_batchbuffer = 1435 i915_error_object_create(i915, 1436 i915->gt.scratch); 1437 request_record_user_bo(request, ee); 1438 1439 ee->ctx = 1440 i915_error_object_create(i915, 1441 request->hw_context->state); 1442 1443 error->simulated |= 1444 i915_gem_context_no_error_capture(ctx); 1445 1446 ee->rq_head = request->head; 1447 ee->rq_post = request->postfix; 1448 ee->rq_tail = request->tail; 1449 1450 ring = request->ring; 1451 ee->cpu_ring_head = ring->head; 1452 ee->cpu_ring_tail = ring->tail; 1453 ee->ringbuffer = 1454 i915_error_object_create(i915, ring->vma); 1455 1456 engine_record_requests(engine, request, ee); 1457 } 1458 1459 ee->hws_page = 1460 i915_error_object_create(i915, 1461 engine->status_page.vma); 1462 1463 ee->wa_ctx = i915_error_object_create(i915, engine->wa_ctx.vma); 1464 1465 ee->default_state = capture_object(i915, engine->default_state); 1466 } 1467 } 1468 1469 static void gem_capture_vm(struct i915_gpu_state *error, 1470 struct i915_address_space *vm, 1471 int idx) 1472 { 1473 struct drm_i915_error_buffer *active_bo; 1474 struct i915_vma *vma; 1475 int count; 1476 1477 count = 0; 1478 list_for_each_entry(vma, &vm->bound_list, vm_link) 1479 if (i915_vma_is_active(vma)) 1480 count++; 1481 1482 active_bo = NULL; 1483 if (count) 1484 active_bo = kcalloc(count, sizeof(*active_bo), GFP_ATOMIC); 1485 if (active_bo) 1486 count = capture_error_bo(active_bo, 1487 count, &vm->bound_list, 1488 ACTIVE_ONLY); 1489 else 1490 count = 0; 1491 1492 error->active_vm[idx] = vm; 1493 error->active_bo[idx] = active_bo; 1494 error->active_bo_count[idx] = count; 1495 } 1496 1497 static void capture_active_buffers(struct i915_gpu_state *error) 1498 { 1499 int cnt = 0, i, j; 1500 1501 BUILD_BUG_ON(ARRAY_SIZE(error->engine) > ARRAY_SIZE(error->active_bo)); 1502 BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_vm)); 1503 BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_bo_count)); 1504 1505 /* Scan each engine looking for unique active contexts/vm */ 1506 for (i = 0; i < ARRAY_SIZE(error->engine); i++) { 1507 struct drm_i915_error_engine *ee = &error->engine[i]; 1508 bool found; 1509 1510 if (!ee->vm) 1511 continue; 1512 1513 found = false; 1514 for (j = 0; j < i && !found; j++) 1515 found = error->engine[j].vm == ee->vm; 1516 if (!found) 1517 gem_capture_vm(error, ee->vm, cnt++); 1518 } 1519 } 1520 1521 static void capture_pinned_buffers(struct i915_gpu_state *error) 1522 { 1523 struct i915_address_space *vm = &error->i915->ggtt.vm; 1524 struct drm_i915_error_buffer *bo; 1525 struct i915_vma *vma; 1526 int count; 1527 1528 count = 0; 1529 list_for_each_entry(vma, &vm->bound_list, vm_link) 1530 count++; 1531 1532 bo = NULL; 1533 if (count) 1534 bo = kcalloc(count, sizeof(*bo), GFP_ATOMIC); 1535 if (!bo) 1536 return; 1537 1538 error->pinned_bo_count = 1539 capture_error_bo(bo, count, &vm->bound_list, PINNED_ONLY); 1540 error->pinned_bo = bo; 1541 } 1542 1543 static void capture_uc_state(struct i915_gpu_state *error) 1544 { 1545 struct drm_i915_private *i915 = error->i915; 1546 struct i915_error_uc *error_uc = &error->uc; 1547 1548 /* Capturing uC state won't be useful if there is no GuC */ 1549 if (!error->device_info.has_guc) 1550 return; 1551 1552 error_uc->guc_fw = i915->guc.fw; 1553 error_uc->huc_fw = i915->huc.fw; 1554 1555 /* Non-default firmware paths will be specified by the modparam. 1556 * As modparams are generally accesible from the userspace make 1557 * explicit copies of the firmware paths. 1558 */ 1559 error_uc->guc_fw.path = kstrdup(i915->guc.fw.path, GFP_ATOMIC); 1560 error_uc->huc_fw.path = kstrdup(i915->huc.fw.path, GFP_ATOMIC); 1561 error_uc->guc_log = i915_error_object_create(i915, i915->guc.log.vma); 1562 } 1563 1564 /* Capture all registers which don't fit into another category. */ 1565 static void capture_reg_state(struct i915_gpu_state *error) 1566 { 1567 struct drm_i915_private *dev_priv = error->i915; 1568 int i; 1569 1570 /* General organization 1571 * 1. Registers specific to a single generation 1572 * 2. Registers which belong to multiple generations 1573 * 3. Feature specific registers. 1574 * 4. Everything else 1575 * Please try to follow the order. 1576 */ 1577 1578 /* 1: Registers specific to a single generation */ 1579 if (IS_VALLEYVIEW(dev_priv)) { 1580 error->gtier[0] = I915_READ(GTIER); 1581 error->ier = I915_READ(VLV_IER); 1582 error->forcewake = I915_READ_FW(FORCEWAKE_VLV); 1583 } 1584 1585 if (IS_GEN(dev_priv, 7)) 1586 error->err_int = I915_READ(GEN7_ERR_INT); 1587 1588 if (INTEL_GEN(dev_priv) >= 8) { 1589 error->fault_data0 = I915_READ(GEN8_FAULT_TLB_DATA0); 1590 error->fault_data1 = I915_READ(GEN8_FAULT_TLB_DATA1); 1591 } 1592 1593 if (IS_GEN(dev_priv, 6)) { 1594 error->forcewake = I915_READ_FW(FORCEWAKE); 1595 error->gab_ctl = I915_READ(GAB_CTL); 1596 error->gfx_mode = I915_READ(GFX_MODE); 1597 } 1598 1599 /* 2: Registers which belong to multiple generations */ 1600 if (INTEL_GEN(dev_priv) >= 7) 1601 error->forcewake = I915_READ_FW(FORCEWAKE_MT); 1602 1603 if (INTEL_GEN(dev_priv) >= 6) { 1604 error->derrmr = I915_READ(DERRMR); 1605 error->error = I915_READ(ERROR_GEN6); 1606 error->done_reg = I915_READ(DONE_REG); 1607 } 1608 1609 if (INTEL_GEN(dev_priv) >= 5) 1610 error->ccid = I915_READ(CCID(RENDER_RING_BASE)); 1611 1612 /* 3: Feature specific registers */ 1613 if (IS_GEN_RANGE(dev_priv, 6, 7)) { 1614 error->gam_ecochk = I915_READ(GAM_ECOCHK); 1615 error->gac_eco = I915_READ(GAC_ECO_BITS); 1616 } 1617 1618 /* 4: Everything else */ 1619 if (INTEL_GEN(dev_priv) >= 11) { 1620 error->ier = I915_READ(GEN8_DE_MISC_IER); 1621 error->gtier[0] = I915_READ(GEN11_RENDER_COPY_INTR_ENABLE); 1622 error->gtier[1] = I915_READ(GEN11_VCS_VECS_INTR_ENABLE); 1623 error->gtier[2] = I915_READ(GEN11_GUC_SG_INTR_ENABLE); 1624 error->gtier[3] = I915_READ(GEN11_GPM_WGBOXPERF_INTR_ENABLE); 1625 error->gtier[4] = I915_READ(GEN11_CRYPTO_RSVD_INTR_ENABLE); 1626 error->gtier[5] = I915_READ(GEN11_GUNIT_CSME_INTR_ENABLE); 1627 error->ngtier = 6; 1628 } else if (INTEL_GEN(dev_priv) >= 8) { 1629 error->ier = I915_READ(GEN8_DE_MISC_IER); 1630 for (i = 0; i < 4; i++) 1631 error->gtier[i] = I915_READ(GEN8_GT_IER(i)); 1632 error->ngtier = 4; 1633 } else if (HAS_PCH_SPLIT(dev_priv)) { 1634 error->ier = I915_READ(DEIER); 1635 error->gtier[0] = I915_READ(GTIER); 1636 error->ngtier = 1; 1637 } else if (IS_GEN(dev_priv, 2)) { 1638 error->ier = I915_READ16(GEN2_IER); 1639 } else if (!IS_VALLEYVIEW(dev_priv)) { 1640 error->ier = I915_READ(GEN2_IER); 1641 } 1642 error->eir = I915_READ(EIR); 1643 error->pgtbl_er = I915_READ(PGTBL_ER); 1644 } 1645 1646 static const char * 1647 error_msg(struct i915_gpu_state *error, 1648 intel_engine_mask_t engines, const char *msg) 1649 { 1650 int len; 1651 int i; 1652 1653 for (i = 0; i < ARRAY_SIZE(error->engine); i++) 1654 if (!error->engine[i].context.pid) 1655 engines &= ~BIT(i); 1656 1657 len = scnprintf(error->error_msg, sizeof(error->error_msg), 1658 "GPU HANG: ecode %d:%x:0x%08x", 1659 INTEL_GEN(error->i915), engines, 1660 i915_error_generate_code(error, engines)); 1661 if (engines) { 1662 /* Just show the first executing process, more is confusing */ 1663 i = __ffs(engines); 1664 len += scnprintf(error->error_msg + len, 1665 sizeof(error->error_msg) - len, 1666 ", in %s [%d]", 1667 error->engine[i].context.comm, 1668 error->engine[i].context.pid); 1669 } 1670 if (msg) 1671 len += scnprintf(error->error_msg + len, 1672 sizeof(error->error_msg) - len, 1673 ", %s", msg); 1674 1675 return error->error_msg; 1676 } 1677 1678 static void capture_gen_state(struct i915_gpu_state *error) 1679 { 1680 struct drm_i915_private *i915 = error->i915; 1681 1682 error->awake = i915->gt.awake; 1683 error->wakelock = atomic_read(&i915->runtime_pm.wakeref_count); 1684 error->suspended = i915->runtime_pm.suspended; 1685 1686 error->iommu = -1; 1687 #ifdef CONFIG_INTEL_IOMMU 1688 error->iommu = intel_iommu_gfx_mapped; 1689 #endif 1690 error->reset_count = i915_reset_count(&i915->gpu_error); 1691 error->suspend_count = i915->suspend_count; 1692 1693 memcpy(&error->device_info, 1694 INTEL_INFO(i915), 1695 sizeof(error->device_info)); 1696 memcpy(&error->runtime_info, 1697 RUNTIME_INFO(i915), 1698 sizeof(error->runtime_info)); 1699 error->driver_caps = i915->caps; 1700 } 1701 1702 static void capture_params(struct i915_gpu_state *error) 1703 { 1704 i915_params_copy(&error->params, &i915_modparams); 1705 } 1706 1707 static unsigned long capture_find_epoch(const struct i915_gpu_state *error) 1708 { 1709 unsigned long epoch = error->capture; 1710 int i; 1711 1712 for (i = 0; i < ARRAY_SIZE(error->engine); i++) { 1713 const struct drm_i915_error_engine *ee = &error->engine[i]; 1714 1715 if (ee->hangcheck_timestamp && 1716 time_before(ee->hangcheck_timestamp, epoch)) 1717 epoch = ee->hangcheck_timestamp; 1718 } 1719 1720 return epoch; 1721 } 1722 1723 static void capture_finish(struct i915_gpu_state *error) 1724 { 1725 struct i915_ggtt *ggtt = &error->i915->ggtt; 1726 const u64 slot = ggtt->error_capture.start; 1727 1728 ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE); 1729 } 1730 1731 static int capture(void *data) 1732 { 1733 struct i915_gpu_state *error = data; 1734 1735 error->time = ktime_get_real(); 1736 error->boottime = ktime_get_boottime(); 1737 error->uptime = ktime_sub(ktime_get(), 1738 error->i915->gt.last_init_time); 1739 error->capture = jiffies; 1740 1741 capture_params(error); 1742 capture_gen_state(error); 1743 capture_uc_state(error); 1744 capture_reg_state(error); 1745 gem_record_fences(error); 1746 gem_record_rings(error); 1747 capture_active_buffers(error); 1748 capture_pinned_buffers(error); 1749 1750 error->overlay = intel_overlay_capture_error_state(error->i915); 1751 error->display = intel_display_capture_error_state(error->i915); 1752 1753 error->epoch = capture_find_epoch(error); 1754 1755 capture_finish(error); 1756 return 0; 1757 } 1758 1759 #define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x)) 1760 1761 struct i915_gpu_state * 1762 i915_capture_gpu_state(struct drm_i915_private *i915) 1763 { 1764 struct i915_gpu_state *error; 1765 1766 /* Check if GPU capture has been disabled */ 1767 error = READ_ONCE(i915->gpu_error.first_error); 1768 if (IS_ERR(error)) 1769 return error; 1770 1771 error = kzalloc(sizeof(*error), GFP_ATOMIC); 1772 if (!error) { 1773 i915_disable_error_state(i915, -ENOMEM); 1774 return ERR_PTR(-ENOMEM); 1775 } 1776 1777 kref_init(&error->ref); 1778 error->i915 = i915; 1779 1780 stop_machine(capture, error, NULL); 1781 1782 return error; 1783 } 1784 1785 /** 1786 * i915_capture_error_state - capture an error record for later analysis 1787 * @i915: i915 device 1788 * @engine_mask: the mask of engines triggering the hang 1789 * @msg: a message to insert into the error capture header 1790 * 1791 * Should be called when an error is detected (either a hang or an error 1792 * interrupt) to capture error state from the time of the error. Fills 1793 * out a structure which becomes available in debugfs for user level tools 1794 * to pick up. 1795 */ 1796 void i915_capture_error_state(struct drm_i915_private *i915, 1797 intel_engine_mask_t engine_mask, 1798 const char *msg) 1799 { 1800 static bool warned; 1801 struct i915_gpu_state *error; 1802 unsigned long flags; 1803 1804 if (!i915_modparams.error_capture) 1805 return; 1806 1807 if (READ_ONCE(i915->gpu_error.first_error)) 1808 return; 1809 1810 error = i915_capture_gpu_state(i915); 1811 if (IS_ERR(error)) 1812 return; 1813 1814 dev_info(i915->drm.dev, "%s\n", error_msg(error, engine_mask, msg)); 1815 1816 if (!error->simulated) { 1817 spin_lock_irqsave(&i915->gpu_error.lock, flags); 1818 if (!i915->gpu_error.first_error) { 1819 i915->gpu_error.first_error = error; 1820 error = NULL; 1821 } 1822 spin_unlock_irqrestore(&i915->gpu_error.lock, flags); 1823 } 1824 1825 if (error) { 1826 __i915_gpu_state_free(&error->ref); 1827 return; 1828 } 1829 1830 if (!warned && 1831 ktime_get_real_seconds() - DRIVER_TIMESTAMP < DAY_AS_SECONDS(180)) { 1832 DRM_INFO("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n"); 1833 DRM_INFO("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n"); 1834 DRM_INFO("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n"); 1835 DRM_INFO("The gpu crash dump is required to analyze gpu hangs, so please always attach it.\n"); 1836 DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n", 1837 i915->drm.primary->index); 1838 warned = true; 1839 } 1840 } 1841 1842 struct i915_gpu_state * 1843 i915_first_error_state(struct drm_i915_private *i915) 1844 { 1845 struct i915_gpu_state *error; 1846 1847 spin_lock_irq(&i915->gpu_error.lock); 1848 error = i915->gpu_error.first_error; 1849 if (!IS_ERR_OR_NULL(error)) 1850 i915_gpu_state_get(error); 1851 spin_unlock_irq(&i915->gpu_error.lock); 1852 1853 return error; 1854 } 1855 1856 void i915_reset_error_state(struct drm_i915_private *i915) 1857 { 1858 struct i915_gpu_state *error; 1859 1860 spin_lock_irq(&i915->gpu_error.lock); 1861 error = i915->gpu_error.first_error; 1862 if (error != ERR_PTR(-ENODEV)) /* if disabled, always disabled */ 1863 i915->gpu_error.first_error = NULL; 1864 spin_unlock_irq(&i915->gpu_error.lock); 1865 1866 if (!IS_ERR_OR_NULL(error)) 1867 i915_gpu_state_put(error); 1868 } 1869 1870 void i915_disable_error_state(struct drm_i915_private *i915, int err) 1871 { 1872 spin_lock_irq(&i915->gpu_error.lock); 1873 if (!i915->gpu_error.first_error) 1874 i915->gpu_error.first_error = ERR_PTR(err); 1875 spin_unlock_irq(&i915->gpu_error.lock); 1876 } 1877