diff options
Diffstat (limited to 'drivers/gpu/drm/i915/intel_hangcheck.c')
-rw-r--r-- | drivers/gpu/drm/i915/intel_hangcheck.c | 256 |
1 files changed, 144 insertions, 112 deletions
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c index 53df5b11bff4..f05971f5586f 100644 --- a/drivers/gpu/drm/i915/intel_hangcheck.c +++ b/drivers/gpu/drm/i915/intel_hangcheck.c @@ -236,13 +236,13 @@ head_stuck(struct intel_engine_cs *engine, u64 acthd) memset(&engine->hangcheck.instdone, 0, sizeof(engine->hangcheck.instdone)); - return HANGCHECK_ACTIVE; + return ENGINE_ACTIVE_HEAD; } if (!subunits_stuck(engine)) - return HANGCHECK_ACTIVE; + return ENGINE_ACTIVE_SUBUNITS; - return HANGCHECK_HUNG; + return ENGINE_DEAD; } static enum intel_engine_hangcheck_action @@ -253,11 +253,11 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd) u32 tmp; ha = head_stuck(engine, acthd); - if (ha != HANGCHECK_HUNG) + if (ha != ENGINE_DEAD) return ha; if (IS_GEN2(dev_priv)) - return HANGCHECK_HUNG; + return ENGINE_DEAD; /* Is the chip hanging on a WAIT_FOR_EVENT? * If so we can simply poke the RB_WAIT bit @@ -270,25 +270,144 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd) "Kicking stuck wait on %s", engine->name); I915_WRITE_CTL(engine, tmp); - return HANGCHECK_KICK; + return ENGINE_WAIT_KICK; } if (INTEL_GEN(dev_priv) >= 6 && tmp & RING_WAIT_SEMAPHORE) { switch (semaphore_passed(engine)) { default: - return HANGCHECK_HUNG; + return ENGINE_DEAD; case 1: i915_handle_error(dev_priv, 0, "Kicking stuck semaphore on %s", engine->name); I915_WRITE_CTL(engine, tmp); - return HANGCHECK_KICK; + return ENGINE_WAIT_KICK; case 0: - return HANGCHECK_WAIT; + return ENGINE_WAIT; } } - return HANGCHECK_HUNG; + return ENGINE_DEAD; +} + +static void hangcheck_load_sample(struct intel_engine_cs *engine, + struct intel_engine_hangcheck *hc) +{ + /* We don't strictly need an irq-barrier here, as we are not + * serving an interrupt request, be paranoid in case the + * barrier has side-effects (such as preventing a broken + * cacheline snoop) and so be sure that we can see the seqno + * advance. If the seqno should stick, due to a stale + * cacheline, we would erroneously declare the GPU hung. + */ + if (engine->irq_seqno_barrier) + engine->irq_seqno_barrier(engine); + + hc->acthd = intel_engine_get_active_head(engine); + hc->seqno = intel_engine_get_seqno(engine); +} + +static void hangcheck_store_sample(struct intel_engine_cs *engine, + const struct intel_engine_hangcheck *hc) +{ + engine->hangcheck.acthd = hc->acthd; + engine->hangcheck.seqno = hc->seqno; + engine->hangcheck.action = hc->action; + engine->hangcheck.stalled = hc->stalled; +} + +static enum intel_engine_hangcheck_action +hangcheck_get_action(struct intel_engine_cs *engine, + const struct intel_engine_hangcheck *hc) +{ + if (engine->hangcheck.seqno != hc->seqno) + return ENGINE_ACTIVE_SEQNO; + + if (i915_seqno_passed(hc->seqno, intel_engine_last_submit(engine))) + return ENGINE_IDLE; + + return engine_stuck(engine, hc->acthd); +} + +static void hangcheck_accumulate_sample(struct intel_engine_cs *engine, + struct intel_engine_hangcheck *hc) +{ + unsigned long timeout = I915_ENGINE_DEAD_TIMEOUT; + + hc->action = hangcheck_get_action(engine, hc); + + /* We always increment the progress + * if the engine is busy and still processing + * the same request, so that no single request + * can run indefinitely (such as a chain of + * batches). The only time we do not increment + * the hangcheck score on this ring, if this + * engine is in a legitimate wait for another + * engine. In that case the waiting engine is a + * victim and we want to be sure we catch the + * right culprit. Then every time we do kick + * the ring, make it as a progress as the seqno + * advancement might ensure and if not, it + * will catch the hanging engine. + */ + + switch (hc->action) { + case ENGINE_IDLE: + case ENGINE_ACTIVE_SEQNO: + /* Clear head and subunit states on seqno movement */ + hc->acthd = 0; + + memset(&engine->hangcheck.instdone, 0, + sizeof(engine->hangcheck.instdone)); + + /* Intentional fall through */ + case ENGINE_WAIT_KICK: + case ENGINE_WAIT: + engine->hangcheck.action_timestamp = jiffies; + break; + + case ENGINE_ACTIVE_HEAD: + case ENGINE_ACTIVE_SUBUNITS: + /* Seqno stuck with still active engine gets leeway, + * in hopes that it is just a long shader. + */ + timeout = I915_SEQNO_DEAD_TIMEOUT; + break; + + case ENGINE_DEAD: + break; + + default: + MISSING_CASE(hc->action); + } + + hc->stalled = time_after(jiffies, + engine->hangcheck.action_timestamp + timeout); +} + +static void hangcheck_declare_hang(struct drm_i915_private *i915, + unsigned int hung, + unsigned int stuck) +{ + struct intel_engine_cs *engine; + char msg[80]; + unsigned int tmp; + int len; + + /* If some rings hung but others were still busy, only + * blame the hanging rings in the synopsis. + */ + if (stuck != hung) + hung &= ~stuck; + len = scnprintf(msg, sizeof(msg), + "%s on ", stuck == hung ? "No progress" : "Hang"); + for_each_engine_masked(engine, i915, hung, tmp) + len += scnprintf(msg + len, sizeof(msg) - len, + "%s, ", engine->name); + msg[len-2] = '\0'; + + return i915_handle_error(i915, hung, msg); } /* @@ -308,10 +427,6 @@ static void i915_hangcheck_elapsed(struct work_struct *work) enum intel_engine_id id; unsigned int hung = 0, stuck = 0; int busy_count = 0; -#define BUSY 1 -#define KICK 5 -#define HUNG 20 -#define ACTIVE_DECAY 15 if (!i915.enable_hangcheck) return; @@ -319,6 +434,9 @@ static void i915_hangcheck_elapsed(struct work_struct *work) if (!READ_ONCE(dev_priv->gt.awake)) return; + if (i915_terminally_wedged(&dev_priv->gpu_error)) + return; + /* As enabling the GPU requires fairly extensive mmio access, * periodically arm the mmio checker to see if we are triggering * any invalid access. @@ -326,112 +444,26 @@ static void i915_hangcheck_elapsed(struct work_struct *work) intel_uncore_arm_unclaimed_mmio_detection(dev_priv); for_each_engine(engine, dev_priv, id) { - bool busy = intel_engine_has_waiter(engine); - u64 acthd; - u32 seqno; - u32 submit; + struct intel_engine_hangcheck cur_state, *hc = &cur_state; + const bool busy = intel_engine_has_waiter(engine); semaphore_clear_deadlocks(dev_priv); - /* We don't strictly need an irq-barrier here, as we are not - * serving an interrupt request, be paranoid in case the - * barrier has side-effects (such as preventing a broken - * cacheline snoop) and so be sure that we can see the seqno - * advance. If the seqno should stick, due to a stale - * cacheline, we would erroneously declare the GPU hung. - */ - if (engine->irq_seqno_barrier) - engine->irq_seqno_barrier(engine); - - acthd = intel_engine_get_active_head(engine); - seqno = intel_engine_get_seqno(engine); - submit = intel_engine_last_submit(engine); - - if (engine->hangcheck.seqno == seqno) { - if (i915_seqno_passed(seqno, submit)) { - engine->hangcheck.action = HANGCHECK_IDLE; - } else { - /* We always increment the hangcheck score - * if the engine is busy and still processing - * the same request, so that no single request - * can run indefinitely (such as a chain of - * batches). The only time we do not increment - * the hangcheck score on this ring, if this - * engine is in a legitimate wait for another - * engine. In that case the waiting engine is a - * victim and we want to be sure we catch the - * right culprit. Then every time we do kick - * the ring, add a small increment to the - * score so that we can catch a batch that is - * being repeatedly kicked and so responsible - * for stalling the machine. - */ - engine->hangcheck.action = - engine_stuck(engine, acthd); - - switch (engine->hangcheck.action) { - case HANGCHECK_IDLE: - case HANGCHECK_WAIT: - break; - case HANGCHECK_ACTIVE: - engine->hangcheck.score += BUSY; - break; - case HANGCHECK_KICK: - engine->hangcheck.score += KICK; - break; - case HANGCHECK_HUNG: - engine->hangcheck.score += HUNG; - break; - } - } - - if (engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG) { - hung |= intel_engine_flag(engine); - if (engine->hangcheck.action != HANGCHECK_HUNG) - stuck |= intel_engine_flag(engine); - } - } else { - engine->hangcheck.action = HANGCHECK_ACTIVE; - - /* Gradually reduce the count so that we catch DoS - * attempts across multiple batches. - */ - if (engine->hangcheck.score > 0) - engine->hangcheck.score -= ACTIVE_DECAY; - if (engine->hangcheck.score < 0) - engine->hangcheck.score = 0; - - /* Clear head and subunit states on seqno movement */ - acthd = 0; - - memset(&engine->hangcheck.instdone, 0, - sizeof(engine->hangcheck.instdone)); + hangcheck_load_sample(engine, hc); + hangcheck_accumulate_sample(engine, hc); + hangcheck_store_sample(engine, hc); + + if (engine->hangcheck.stalled) { + hung |= intel_engine_flag(engine); + if (hc->action != ENGINE_DEAD) + stuck |= intel_engine_flag(engine); } - engine->hangcheck.seqno = seqno; - engine->hangcheck.acthd = acthd; busy_count += busy; } - if (hung) { - char msg[80]; - unsigned int tmp; - int len; - - /* If some rings hung but others were still busy, only - * blame the hanging rings in the synopsis. - */ - if (stuck != hung) - hung &= ~stuck; - len = scnprintf(msg, sizeof(msg), - "%s on ", stuck == hung ? "No progress" : "Hang"); - for_each_engine_masked(engine, dev_priv, hung, tmp) - len += scnprintf(msg + len, sizeof(msg) - len, - "%s, ", engine->name); - msg[len-2] = '\0'; - - return i915_handle_error(dev_priv, hung, msg); - } + if (hung) + hangcheck_declare_hang(dev_priv, hung, stuck); /* Reset timer in case GPU hangs without another request being added */ if (busy_count) |