refactor: surface extraction cleanup failures as visible diagnostics

- Add IMH_EXTRACTION_CLEANUP_FAILED and IMH_EXTRACTION_RESIDUAL_ATTRIBUTES
  diagnostic codes with severity warning and extraction-error category
- Check for residual data-imhotep-runtime-id attributes before extraction
  (both fast-geometry and CDP paths), emit diagnostic if prior run leaked them
- Surface cleanup failures (attribute removal, CDP session detach) as
  returned diagnostics in addition to the existing console.warn output
- Wrap CDP sessionManager.detach() in try/catch to prevent finally-block
  unhandled throws on closed/navigated pages
- Count runtime-id attributes removed during cleanup, then re-query for
  any that remain; report leftovers as IMH_EXTRACTION_CLEANUP_FAILED
  with injected/remaining metrics
- Move errors array declaration before try block in fast-geometry path
  so finally can append cleanup diagnostics
This commit is contained in:
John Dvorak
2026-05-22 16:06:21 -07:00
parent 066ef9f677
commit ce04b2b3de
2 changed files with 163 additions and 15 deletions
+13 -1
View File
@@ -88,6 +88,8 @@ export type DiagnosticCode =
| 'IMH_STYLE_FAILED' | 'IMH_STYLE_FAILED'
| 'IMH_TOPOLOGY_PARTIAL' | 'IMH_TOPOLOGY_PARTIAL'
| 'IMH_TOPOLOGY_FAILED' | 'IMH_TOPOLOGY_FAILED'
| 'IMH_EXTRACTION_CLEANUP_FAILED'
| 'IMH_EXTRACTION_RESIDUAL_ATTRIBUTES'
// ------------------------------------------------------------------------- // -------------------------------------------------------------------------
// Extractor planner errors (imhotep-extractor) // Extractor planner errors (imhotep-extractor)
@@ -420,7 +422,7 @@ export function getDefaultCategory(code: DiagnosticCode): DiagnosticCategory {
if (code.startsWith('IMH_VALID_')) return 'validation-error' if (code.startsWith('IMH_VALID_')) return 'validation-error'
if (code.startsWith('IMH_SELECTOR_') || code.startsWith('IMH_FRAME_') || code.startsWith('IMH_STATE_MATERIALIZATION')) return 'resolution-error' if (code.startsWith('IMH_SELECTOR_') || code.startsWith('IMH_FRAME_') || code.startsWith('IMH_STATE_MATERIALIZATION')) return 'resolution-error'
if (code.startsWith('IMH_EXTRACTOR_')) return 'resolution-error' if (code.startsWith('IMH_EXTRACTOR_')) return 'resolution-error'
if (code.startsWith('IMH_EXTRACT_') || code.startsWith('IMH_CDP_') || code.startsWith('IMH_DOM_') || code.startsWith('IMH_BOX_MODEL') || code.startsWith('IMH_VISUAL_BOX') || code.startsWith('IMH_FRAGMENT') || code.startsWith('IMH_TRANSFORM') || code.startsWith('IMH_STYLE') || code.startsWith('IMH_TOPOLOGY_PARTIAL') || code.startsWith('IMH_TOPOLOGY_FAILED')) return 'extraction-error' if (code.startsWith('IMH_EXTRACT_') || code.startsWith('IMH_EXTRACTION_') || code.startsWith('IMH_CDP_') || code.startsWith('IMH_DOM_') || code.startsWith('IMH_BOX_MODEL') || code.startsWith('IMH_VISUAL_BOX') || code.startsWith('IMH_FRAGMENT') || code.startsWith('IMH_TRANSFORM') || code.startsWith('IMH_STYLE') || code.startsWith('IMH_TOPOLOGY_PARTIAL') || code.startsWith('IMH_TOPOLOGY_FAILED')) return 'extraction-error'
if (code.startsWith('IMH_RELATION_') || code.startsWith('IMH_SIZE_') || code.startsWith('IMH_ALIGNMENT') || code.startsWith('IMH_TOPOLOGY_CLIPPED') || code.startsWith('IMH_TOPOLOGY_STACKING') || code.startsWith('IMH_VISIBILITY') || code.startsWith('IMH_PREDICATE') || code.startsWith('IMH_CARDINALITY')) return 'contract-failure' if (code.startsWith('IMH_RELATION_') || code.startsWith('IMH_SIZE_') || code.startsWith('IMH_ALIGNMENT') || code.startsWith('IMH_TOPOLOGY_CLIPPED') || code.startsWith('IMH_TOPOLOGY_STACKING') || code.startsWith('IMH_VISIBILITY') || code.startsWith('IMH_PREDICATE') || code.startsWith('IMH_CARDINALITY')) return 'contract-failure'
if (code.startsWith('IMH_FACT_OBSERVED_')) return 'contract-failure' if (code.startsWith('IMH_FACT_OBSERVED_')) return 'contract-failure'
if (code.startsWith('IMH_PROPERTY_') || code.startsWith('IMH_ENUMERATED_') || code === 'IMH_PROPERTY_RUN_FAILED') return 'contract-failure' if (code.startsWith('IMH_PROPERTY_') || code.startsWith('IMH_ENUMERATED_') || code === 'IMH_PROPERTY_RUN_FAILED') return 'contract-failure'
@@ -453,6 +455,16 @@ export function getDefaultFixHints(code: DiagnosticCode): string[] {
hints.push('The selector matches more than one element. Use a more specific selector or add a quantifier.') hints.push('The selector matches more than one element. Use a more specific selector or add a quantifier.')
} }
if (code === 'IMH_EXTRACTION_CLEANUP_FAILED') {
hints.push('Verify the page is still interactive (not closed or navigated away).')
hints.push('If using CDP mode, check that the browser connection is healthy.')
}
if (code === 'IMH_EXTRACTION_RESIDUAL_ATTRIBUTES') {
hints.push('Leftover data-imhotep-runtime-id attributes indicate a prior extraction did not clean up.')
hints.push('A page reload or navigating away and back may clear residual attributes.')
}
if (code === 'IMH_EXTRACT_PROTOCOL_ERROR' || code === 'IMH_CDP_SESSION_ATTACH_FAILED') { if (code === 'IMH_EXTRACT_PROTOCOL_ERROR' || code === 'IMH_CDP_SESSION_ATTACH_FAILED') {
hints.push('Verify the page is fully loaded before running assertions.') hints.push('Verify the page is fully loaded before running assertions.')
hints.push('Check that selectors are valid CSS selectors or semantic references.') hints.push('Check that selectors are valid CSS selectors or semantic references.')
+150 -14
View File
@@ -418,12 +418,35 @@ export async function extractWorldFastGeometry(
selectorToIds: Array<[string, number[]]> selectorToIds: Array<[string, number[]]>
} }
const errors: ImhotepDiagnostic[] = []
const selectorPlans: SelectorPlan[] = [] const selectorPlans: SelectorPlan[] = []
for (let i = 0; i < selectors.length; i++) { for (let i = 0; i < selectors.length; i++) {
const queries = await materializeSemanticSelector(playwrightPage, selectors[i], i) const queries = await materializeSemanticSelector(playwrightPage, selectors[i], i)
selectorPlans.push({ key: selectors[i], queries }) selectorPlans.push({ key: selectors[i], queries })
} }
try {
const residualBefore = await playwrightPage.evaluate(() =>
document.querySelectorAll('[data-imhotep-runtime-id]').length,
)
if (residualBefore > 0) {
errors.push({
code: 'IMH_EXTRACTION_RESIDUAL_ATTRIBUTES',
severity: 'warning',
category: 'extraction-error',
message: `Found ${residualBefore} residual data-imhotep-runtime-id attribute(s) from a prior extraction that did not clean up.`,
source: 'imhotep-playwright',
related: [],
fixHints: ['Leftover attributes indicate a prior extraction did not clean up. A page reload or navigating away and back may clear residual attributes.'],
metrics: { residualCount: residualBefore },
sourceRef: {},
})
}
} catch {
// Best-effort pre-check; proceed with extraction.
}
try { try {
const extracted = await playwrightPage.evaluate(({ plans, needs }: any) => { const extracted = await playwrightPage.evaluate(({ plans, needs }: any) => {
const elements: FastExtractedElement[] = [] const elements: FastExtractedElement[] = []
@@ -701,7 +724,6 @@ export async function extractWorldFastGeometry(
} }
const selectorToIds = new Map<string, number[]>(extracted.selectorToIds) const selectorToIds = new Map<string, number[]>(extracted.selectorToIds)
const errors: ImhotepDiagnostic[] = []
for (const [selector, ids] of selectorToIds) { for (const [selector, ids] of selectorToIds) {
if (ids.length === 0 && !selector.startsWith('$')) { if (ids.length === 0 && !selector.startsWith('$')) {
@@ -732,15 +754,45 @@ export async function extractWorldFastGeometry(
} }
return { world, selectorToIds, errors } return { world, selectorToIds, errors }
} finally { } finally {
await playwrightPage.evaluate(() => { try {
const nodes = Array.from(document.querySelectorAll('[data-imhotep-runtime-id]')) const result = await playwrightPage.evaluate(() => {
for (const el of nodes) { const nodes = Array.from(document.querySelectorAll<HTMLElement>('[data-imhotep-runtime-id]'))
el.removeAttribute('data-imhotep-runtime-id') const cleaned = nodes.length
for (const el of nodes) {
el.removeAttribute('data-imhotep-runtime-id')
}
// Verify all are gone
const remaining = document.querySelectorAll('[data-imhotep-runtime-id]').length
return { cleaned, remaining }
})
if (result.remaining > 0) {
errors.push({
code: 'IMH_EXTRACTION_CLEANUP_FAILED',
severity: 'warning',
category: 'extraction-error',
message: `Cleanup incomplete: ${result.remaining} data-imhotep-runtime-id attribute(s) could not be removed (injected ${result.cleaned}).`,
source: 'imhotep-playwright',
related: [],
fixHints: ['Verify the page is still interactive (not closed or navigated away).'],
metrics: { injected: result.cleaned, remaining: result.remaining },
sourceRef: {},
})
} }
}).catch((err) => { } catch (err) {
errors.push({
code: 'IMH_EXTRACTION_CLEANUP_FAILED',
severity: 'warning',
category: 'extraction-error',
message: `Fast-geometry cleanup failed: ${err instanceof Error ? err.message : String(err)}`,
source: 'imhotep-playwright',
related: [],
fixHints: ['Verify the page is still interactive (not closed or navigated away).'],
metrics: {},
sourceRef: {},
})
// eslint-disable-next-line no-console // eslint-disable-next-line no-console
console.warn('[imhotep-playwright] fast-geometry cleanup failed:', err instanceof Error ? err.message : err) console.warn('[imhotep-playwright] fast-geometry cleanup failed:', err instanceof Error ? err.message : err)
}) }
} }
} }
@@ -818,6 +870,28 @@ export async function extractWorldCdp(
} }
const sessionManager = createSessionManager(playwrightPage) const sessionManager = createSessionManager(playwrightPage)
try {
const residualBefore = await playwrightPage.evaluate(() =>
document.querySelectorAll('[data-imhotep-runtime-id]').length,
)
if (residualBefore > 0) {
errors.push({
code: 'IMH_EXTRACTION_RESIDUAL_ATTRIBUTES',
severity: 'warning',
category: 'extraction-error',
message: `Found ${residualBefore} residual data-imhotep-runtime-id attribute(s) from a prior extraction that did not clean up.`,
source: 'imhotep-cdp',
related: [],
fixHints: ['Leftover attributes indicate a prior extraction did not clean up. A page reload or navigating away and back may clear residual attributes.'],
metrics: { residualCount: residualBefore },
sourceRef: {},
})
}
} catch {
// Best-effort pre-check; proceed with extraction.
}
try { try {
await sessionManager.enableDomain('DOM') await sessionManager.enableDomain('DOM')
@@ -925,16 +999,78 @@ export async function extractWorldCdp(
} }
return { world, selectorToIds, errors } return { world, selectorToIds, errors }
} finally { } finally {
await playwrightPage.evaluate(() => { try {
const nodes = Array.from(document.querySelectorAll('[data-imhotep-runtime-id]')) const result = await playwrightPage.evaluate(() => {
for (const el of nodes) { const nodes = Array.from(document.querySelectorAll<HTMLElement>('[data-imhotep-runtime-id]'))
el.removeAttribute('data-imhotep-runtime-id') const cleaned = nodes.length
for (const el of nodes) {
el.removeAttribute('data-imhotep-runtime-id')
}
const remaining = document.querySelectorAll('[data-imhotep-runtime-id]').length
return { cleaned, remaining }
})
if (result.remaining > 0) {
errors.push({
code: 'IMH_EXTRACTION_CLEANUP_FAILED',
severity: 'warning',
category: 'extraction-error',
message: `CDP cleanup incomplete: ${result.remaining} data-imhotep-runtime-id attribute(s) could not be removed (injected ${result.cleaned}).`,
source: 'imhotep-cdp',
related: [],
fixHints: ['Verify the page is still interactive (not closed or navigated away).'],
metrics: { injected: result.cleaned, remaining: result.remaining },
sourceRef: {},
})
} }
}).catch((err) => { } catch (err) {
errors.push({
code: 'IMH_EXTRACTION_CLEANUP_FAILED',
severity: 'warning',
category: 'extraction-error',
message: `CDP attribute cleanup failed: ${err instanceof Error ? err.message : String(err)}`,
source: 'imhotep-cdp',
related: [],
fixHints: ['Verify the page is still interactive (not closed or navigated away).'],
metrics: {},
sourceRef: {},
})
// eslint-disable-next-line no-console // eslint-disable-next-line no-console
console.warn('[imhotep-playwright] CDP cleanup failed:', err instanceof Error ? err.message : err) console.warn('[imhotep-playwright] CDP cleanup failed:', err instanceof Error ? err.message : err)
}) }
await sessionManager.detach() try {
await sessionManager.detach()
} catch (err) {
errors.push({
code: 'IMH_EXTRACTION_CLEANUP_FAILED',
severity: 'warning',
category: 'extraction-error',
message: `CDP session detach failed: ${err instanceof Error ? err.message : String(err)}`,
source: 'imhotep-cdp',
related: [],
fixHints: ['The CDP session may already be closed. This is typically non-fatal.'],
metrics: {},
sourceRef: {},
})
// eslint-disable-next-line no-console
console.warn('[imhotep-playwright] CDP cleanup failed:', err instanceof Error ? err.message : err)
}
try {
await sessionManager.detach()
} catch (err) {
errors.push({
code: 'IMH_EXTRACTION_CLEANUP_FAILED',
severity: 'warning',
category: 'extraction-error',
message: `CDP session detach failed: ${err instanceof Error ? err.message : String(err)}`,
source: 'imhotep-cdp',
related: [],
fixHints: ['The CDP session may already be closed. This is typically non-fatal.'],
metrics: {},
sourceRef: {},
})
// eslint-disable-next-line no-console
console.warn('[imhotep-playwright] CDP session detach failed:', err instanceof Error ? err.message : err)
}
} }
} }