From ce04b2b3de031b5ced50d2ddc04683ac57515c2e Mon Sep 17 00:00:00 2001 From: John Dvorak Date: Fri, 22 May 2026 16:06:21 -0700 Subject: [PATCH] refactor: surface extraction cleanup failures as visible diagnostics - Add IMH_EXTRACTION_CLEANUP_FAILED and IMH_EXTRACTION_RESIDUAL_ATTRIBUTES diagnostic codes with severity warning and extraction-error category - Check for residual data-imhotep-runtime-id attributes before extraction (both fast-geometry and CDP paths), emit diagnostic if prior run leaked them - Surface cleanup failures (attribute removal, CDP session detach) as returned diagnostics instead of silent console.warn - Wrap CDP sessionManager.detach() in try/catch to prevent finally-block unhandled throws on closed/navigated pages - Count injected vs cleaned runtime-id attributes; report mismatch as IMH_EXTRACTION_CLEANUP_FAILED with metrics - Move errors array declaration before try block in fast-geometry path so finally can append cleanup diagnostics --- packages/imhotep-core/src/diagnostics.ts | 14 +- packages/imhotep-playwright/src/extraction.ts | 164 ++++++++++++++++-- 2 files changed, 163 insertions(+), 15 deletions(-) diff --git a/packages/imhotep-core/src/diagnostics.ts b/packages/imhotep-core/src/diagnostics.ts index e6bc9bb..8af1934 100644 --- a/packages/imhotep-core/src/diagnostics.ts +++ b/packages/imhotep-core/src/diagnostics.ts @@ -88,6 +88,8 @@ export type DiagnosticCode = | 'IMH_STYLE_FAILED' | 'IMH_TOPOLOGY_PARTIAL' | 'IMH_TOPOLOGY_FAILED' + | 'IMH_EXTRACTION_CLEANUP_FAILED' + | 'IMH_EXTRACTION_RESIDUAL_ATTRIBUTES' // ------------------------------------------------------------------------- // Extractor planner errors (imhotep-extractor) @@ -420,7 +422,7 @@ export function getDefaultCategory(code: DiagnosticCode): DiagnosticCategory { if (code.startsWith('IMH_VALID_')) return 'validation-error' if (code.startsWith('IMH_SELECTOR_') || code.startsWith('IMH_FRAME_') || code.startsWith('IMH_STATE_MATERIALIZATION')) return 'resolution-error' if (code.startsWith('IMH_EXTRACTOR_')) return 'resolution-error' - if (code.startsWith('IMH_EXTRACT_') || code.startsWith('IMH_CDP_') || code.startsWith('IMH_DOM_') || code.startsWith('IMH_BOX_MODEL') || code.startsWith('IMH_VISUAL_BOX') || code.startsWith('IMH_FRAGMENT') || code.startsWith('IMH_TRANSFORM') || code.startsWith('IMH_STYLE') || code.startsWith('IMH_TOPOLOGY_PARTIAL') || code.startsWith('IMH_TOPOLOGY_FAILED')) return 'extraction-error' + if (code.startsWith('IMH_EXTRACT_') || code.startsWith('IMH_EXTRACTION_') || code.startsWith('IMH_CDP_') || code.startsWith('IMH_DOM_') || code.startsWith('IMH_BOX_MODEL') || code.startsWith('IMH_VISUAL_BOX') || code.startsWith('IMH_FRAGMENT') || code.startsWith('IMH_TRANSFORM') || code.startsWith('IMH_STYLE') || code.startsWith('IMH_TOPOLOGY_PARTIAL') || code.startsWith('IMH_TOPOLOGY_FAILED')) return 'extraction-error' if (code.startsWith('IMH_RELATION_') || code.startsWith('IMH_SIZE_') || code.startsWith('IMH_ALIGNMENT') || code.startsWith('IMH_TOPOLOGY_CLIPPED') || code.startsWith('IMH_TOPOLOGY_STACKING') || code.startsWith('IMH_VISIBILITY') || code.startsWith('IMH_PREDICATE') || code.startsWith('IMH_CARDINALITY')) return 'contract-failure' if (code.startsWith('IMH_FACT_OBSERVED_')) return 'contract-failure' if (code.startsWith('IMH_PROPERTY_') || code.startsWith('IMH_ENUMERATED_') || code === 'IMH_PROPERTY_RUN_FAILED') return 'contract-failure' @@ -453,6 +455,16 @@ export function getDefaultFixHints(code: DiagnosticCode): string[] { hints.push('The selector matches more than one element. Use a more specific selector or add a quantifier.') } + if (code === 'IMH_EXTRACTION_CLEANUP_FAILED') { + hints.push('Verify the page is still interactive (not closed or navigated away).') + hints.push('If using CDP mode, check that the browser connection is healthy.') + } + + if (code === 'IMH_EXTRACTION_RESIDUAL_ATTRIBUTES') { + hints.push('Leftover data-imhotep-runtime-id attributes indicate a prior extraction did not clean up.') + hints.push('A page reload or navigating away and back may clear residual attributes.') + } + if (code === 'IMH_EXTRACT_PROTOCOL_ERROR' || code === 'IMH_CDP_SESSION_ATTACH_FAILED') { hints.push('Verify the page is fully loaded before running assertions.') hints.push('Check that selectors are valid CSS selectors or semantic references.') diff --git a/packages/imhotep-playwright/src/extraction.ts b/packages/imhotep-playwright/src/extraction.ts index 533ca9d..ccd9554 100644 --- a/packages/imhotep-playwright/src/extraction.ts +++ b/packages/imhotep-playwright/src/extraction.ts @@ -418,12 +418,35 @@ export async function extractWorldFastGeometry( selectorToIds: Array<[string, number[]]> } + const errors: ImhotepDiagnostic[] = [] + const selectorPlans: SelectorPlan[] = [] for (let i = 0; i < selectors.length; i++) { const queries = await materializeSemanticSelector(playwrightPage, selectors[i], i) selectorPlans.push({ key: selectors[i], queries }) } + try { + const residualBefore = await playwrightPage.evaluate(() => + document.querySelectorAll('[data-imhotep-runtime-id]').length, + ) + if (residualBefore > 0) { + errors.push({ + code: 'IMH_EXTRACTION_RESIDUAL_ATTRIBUTES', + severity: 'warning', + category: 'extraction-error', + message: `Found ${residualBefore} residual data-imhotep-runtime-id attribute(s) from a prior extraction that did not clean up.`, + source: 'imhotep-playwright', + related: [], + fixHints: ['Leftover attributes indicate a prior extraction did not clean up. A page reload or navigating away and back may clear residual attributes.'], + metrics: { residualCount: residualBefore }, + sourceRef: {}, + }) + } + } catch { + // Best-effort pre-check; proceed with extraction. + } + try { const extracted = await playwrightPage.evaluate(({ plans, needs }: any) => { const elements: FastExtractedElement[] = [] @@ -701,7 +724,6 @@ export async function extractWorldFastGeometry( } const selectorToIds = new Map(extracted.selectorToIds) - const errors: ImhotepDiagnostic[] = [] for (const [selector, ids] of selectorToIds) { if (ids.length === 0 && !selector.startsWith('$')) { @@ -732,15 +754,45 @@ export async function extractWorldFastGeometry( } return { world, selectorToIds, errors } } finally { - await playwrightPage.evaluate(() => { - const nodes = Array.from(document.querySelectorAll('[data-imhotep-runtime-id]')) - for (const el of nodes) { - el.removeAttribute('data-imhotep-runtime-id') + try { + const result = await playwrightPage.evaluate(() => { + const nodes = Array.from(document.querySelectorAll('[data-imhotep-runtime-id]')) + const cleaned = nodes.length + for (const el of nodes) { + el.removeAttribute('data-imhotep-runtime-id') + } + // Verify all are gone + const remaining = document.querySelectorAll('[data-imhotep-runtime-id]').length + return { cleaned, remaining } + }) + if (result.remaining > 0) { + errors.push({ + code: 'IMH_EXTRACTION_CLEANUP_FAILED', + severity: 'warning', + category: 'extraction-error', + message: `Cleanup incomplete: ${result.remaining} data-imhotep-runtime-id attribute(s) could not be removed (injected ${result.cleaned}).`, + source: 'imhotep-playwright', + related: [], + fixHints: ['Verify the page is still interactive (not closed or navigated away).'], + metrics: { injected: result.cleaned, remaining: result.remaining }, + sourceRef: {}, + }) } - }).catch((err) => { + } catch (err) { + errors.push({ + code: 'IMH_EXTRACTION_CLEANUP_FAILED', + severity: 'warning', + category: 'extraction-error', + message: `Fast-geometry cleanup failed: ${err instanceof Error ? err.message : String(err)}`, + source: 'imhotep-playwright', + related: [], + fixHints: ['Verify the page is still interactive (not closed or navigated away).'], + metrics: {}, + sourceRef: {}, + }) // eslint-disable-next-line no-console console.warn('[imhotep-playwright] fast-geometry cleanup failed:', err instanceof Error ? err.message : err) - }) + } } } @@ -818,6 +870,28 @@ export async function extractWorldCdp( } const sessionManager = createSessionManager(playwrightPage) + + try { + const residualBefore = await playwrightPage.evaluate(() => + document.querySelectorAll('[data-imhotep-runtime-id]').length, + ) + if (residualBefore > 0) { + errors.push({ + code: 'IMH_EXTRACTION_RESIDUAL_ATTRIBUTES', + severity: 'warning', + category: 'extraction-error', + message: `Found ${residualBefore} residual data-imhotep-runtime-id attribute(s) from a prior extraction that did not clean up.`, + source: 'imhotep-cdp', + related: [], + fixHints: ['Leftover attributes indicate a prior extraction did not clean up. A page reload or navigating away and back may clear residual attributes.'], + metrics: { residualCount: residualBefore }, + sourceRef: {}, + }) + } + } catch { + // Best-effort pre-check; proceed with extraction. + } + try { await sessionManager.enableDomain('DOM') @@ -925,16 +999,78 @@ export async function extractWorldCdp( } return { world, selectorToIds, errors } } finally { - await playwrightPage.evaluate(() => { - const nodes = Array.from(document.querySelectorAll('[data-imhotep-runtime-id]')) - for (const el of nodes) { - el.removeAttribute('data-imhotep-runtime-id') + try { + const result = await playwrightPage.evaluate(() => { + const nodes = Array.from(document.querySelectorAll('[data-imhotep-runtime-id]')) + const cleaned = nodes.length + for (const el of nodes) { + el.removeAttribute('data-imhotep-runtime-id') + } + const remaining = document.querySelectorAll('[data-imhotep-runtime-id]').length + return { cleaned, remaining } + }) + if (result.remaining > 0) { + errors.push({ + code: 'IMH_EXTRACTION_CLEANUP_FAILED', + severity: 'warning', + category: 'extraction-error', + message: `CDP cleanup incomplete: ${result.remaining} data-imhotep-runtime-id attribute(s) could not be removed (injected ${result.cleaned}).`, + source: 'imhotep-cdp', + related: [], + fixHints: ['Verify the page is still interactive (not closed or navigated away).'], + metrics: { injected: result.cleaned, remaining: result.remaining }, + sourceRef: {}, + }) } - }).catch((err) => { + } catch (err) { + errors.push({ + code: 'IMH_EXTRACTION_CLEANUP_FAILED', + severity: 'warning', + category: 'extraction-error', + message: `CDP attribute cleanup failed: ${err instanceof Error ? err.message : String(err)}`, + source: 'imhotep-cdp', + related: [], + fixHints: ['Verify the page is still interactive (not closed or navigated away).'], + metrics: {}, + sourceRef: {}, + }) // eslint-disable-next-line no-console console.warn('[imhotep-playwright] CDP cleanup failed:', err instanceof Error ? err.message : err) - }) - await sessionManager.detach() + } + try { + await sessionManager.detach() + } catch (err) { + errors.push({ + code: 'IMH_EXTRACTION_CLEANUP_FAILED', + severity: 'warning', + category: 'extraction-error', + message: `CDP session detach failed: ${err instanceof Error ? err.message : String(err)}`, + source: 'imhotep-cdp', + related: [], + fixHints: ['The CDP session may already be closed. This is typically non-fatal.'], + metrics: {}, + sourceRef: {}, + }) + // eslint-disable-next-line no-console + console.warn('[imhotep-playwright] CDP cleanup failed:', err instanceof Error ? err.message : err) + } + try { + await sessionManager.detach() + } catch (err) { + errors.push({ + code: 'IMH_EXTRACTION_CLEANUP_FAILED', + severity: 'warning', + category: 'extraction-error', + message: `CDP session detach failed: ${err instanceof Error ? err.message : String(err)}`, + source: 'imhotep-cdp', + related: [], + fixHints: ['The CDP session may already be closed. This is typically non-fatal.'], + metrics: {}, + sourceRef: {}, + }) + // eslint-disable-next-line no-console + console.warn('[imhotep-playwright] CDP session detach failed:', err instanceof Error ? err.message : err) + } } }