393 lines
10 KiB
TypeScript
393 lines
10 KiB
TypeScript
|
|
/**
 * Canonical Adapter
 *
 * Converts raw CDP extraction output into the canonical GeometryWorld
 * shape defined by imhotep-core. This adapter isolates CDP-specific
 * structures from the canonical representation.
 */
|
||
|
|
|
||
|
|
import type { GeometryWorldSnapshot } from './extractor.js'
|
||
|
|
import type { DOMExtractionResult } from './dom.js'
|
||
|
|
|
||
|
|
// We define local canonical interfaces to avoid cross-package import
// failures when imhotep-core dist is stale. These mirror the core
// canonical contracts exactly.
|
||
|
|
|
||
|
|
/**
 * Environment block of a canonical world.
 *
 * `adaptSnapshotToCanonical` stores `snapshot.env` in this slot as-is.
 * Units and the allowed string values are not defined in this file.
 */
export interface CanonicalWorldEnvironment {
  viewportWidth: number
  viewportHeight: number
  deviceScaleFactor: number
  colorScheme: string
  pointer: string
  hover: boolean
  reducedMotion: string
  locale: string
  writingMode: string
}
|
||
|
|
|
||
|
|
/**
 * Capture metadata of a canonical world.
 *
 * `engine` only admits the literal `'chromium-cdp'`. `extractedAt` is a
 * number; `adaptSnapshotToCanonical` fills it with the epoch-millisecond
 * value parsed from the snapshot's timestamp string.
 */
export interface CanonicalWorldSource {
  url: string
  browserName: string
  browserVersion: string
  engine: 'chromium-cdp'
  extractedAt: number
}
|
||
|
|
|
||
|
|
/**
 * String table of a canonical world: a single list of strings.
 *
 * NOTE(review): columns named `*StringId` in the other tables presumably
 * index into `values` — confirm against the imhotep-core contract.
 */
export interface CanonicalStringTable {
  values: string[]
}
|
||
|
|
|
||
|
|
/**
 * Columnar table of subjects.
 *
 * `adaptSnapshotToCanonical` uses `ids.length` as the subject count.
 * NOTE(review): the columns are presumably parallel arrays with one entry
 * per subject — confirm against the imhotep-core contract.
 */
export interface CanonicalSubjectsTable {
  ids: number[]
  domNodeId: number[]
  subjectKind: number[]
  primaryBoxId: number[]
  firstFragmentId: number[]
  fragmentCount: number[]
  firstTextRunId: number[]
  textRunCount: number[]
}
|
||
|
|
|
||
|
|
/**
 * Columnar table of DOM nodes.
 *
 * As built by `adaptDom` in this file, every column has one entry per
 * extracted node, in extraction order, so index `i` across all columns
 * refers to the same node. `shadowRootKind` is numeric: `adaptDom` writes
 * 1 for `'open'`, 2 for `'closed'` and 0 for anything else.
 */
export interface CanonicalDomTable {
  nodeId: number[]
  backendNodeId: number[]
  parentNodeId: number[]
  firstChildIndex: number[]
  childCount: number[]
  shadowRootKind: number[]
  tagNameStringId: number[]
  roleStringId: number[]
  ariaNameStringId: number[]
}
|
||
|
|
|
||
|
|
/**
 * Columnar table of frames; every column is a numeric array.
 *
 * `adaptSnapshotToCanonical` passes `snapshot.frames` through unchanged.
 * NOTE(review): columns are presumably parallel (one entry per frame), and
 * `axisMatrixStart` presumably points into the matrices table — confirm.
 */
export interface CanonicalFramesTable {
  frameId: number[]
  frameKind: number[]
  ownerSubjectId: number[]
  parentFrameId: number[]
  originX: number[]
  originY: number[]
  axisMatrixStart: number[]
  clipRectId: number[]
  scrollContainerId: number[]
  writingMode: number[]
}
|
||
|
|
|
||
|
|
/**
 * Matrix storage: a single flat list of numbers.
 *
 * NOTE(review): presumably sliced via the `matrixStart` / `matrixLength`
 * and `axisMatrixStart` columns of other tables — confirm.
 */
export interface CanonicalMatricesTable {
  values: number[]
}
|
||
|
|
|
||
|
|
/**
 * Columnar table of rectangles, stored as edge coordinates
 * (`left` / `top` / `right` / `bottom`) rather than origin plus size.
 *
 * `adaptSnapshotToCanonical` passes `snapshot.rects` through unchanged.
 */
export interface CanonicalRectsTable {
  rectId: number[]
  left: number[]
  top: number[]
  right: number[]
  bottom: number[]
}
|
||
|
|
|
||
|
|
/**
 * Columnar table of boxes, with four edge columns each for the border,
 * padding and content areas.
 *
 * This shape is used for both the `boxes` and `visualBoxes` slots of
 * `CanonicalGeometryWorld`.
 */
export interface CanonicalBoxesTable {
  boxId: number[]
  subjectId: number[]
  frameId: number[]
  borderLeft: number[]
  borderTop: number[]
  borderRight: number[]
  borderBottom: number[]
  paddingLeft: number[]
  paddingTop: number[]
  paddingRight: number[]
  paddingBottom: number[]
  contentLeft: number[]
  contentTop: number[]
  contentRight: number[]
  contentBottom: number[]
}
|
||
|
|
|
||
|
|
/**
 * Columnar table of fragments; every column is a numeric array.
 *
 * `adaptSnapshotToCanonical` passes `snapshot.fragments` through unchanged.
 * NOTE(review): columns are presumably parallel, one entry per fragment.
 */
export interface CanonicalFragmentsTable {
  fragmentId: number[]
  subjectId: number[]
  fragmentKind: number[]
  boxLeft: number[]
  boxTop: number[]
  boxRight: number[]
  boxBottom: number[]
  lineIndex: number[]
  flowIndex: number[]
  parentFragmentId: number[]
}
|
||
|
|
|
||
|
|
/**
 * Columnar table of transforms; every column is a numeric array.
 *
 * `adaptSnapshotToCanonical` passes `snapshot.transforms` through unchanged.
 * NOTE(review): `matrixStart` / `matrixLength` presumably address a slice
 * of the matrices table — confirm against the imhotep-core contract.
 */
export interface CanonicalTransformsTable {
  transformId: number[]
  subjectId: number[]
  matrixStart: number[]
  matrixLength: number[]
  originX: number[]
  originY: number[]
}
|
||
|
|
|
||
|
|
/**
 * Columnar table of style values; every column is a numeric array, so
 * keyword-like properties are stored as numbers.
 *
 * `adaptSnapshotToCanonical` passes `snapshot.styles` through unchanged.
 * The numeric encoding of each property is not defined in this file.
 */
export interface CanonicalStylesTable {
  subjectId: number[]
  display: number[]
  position: number[]
  zIndexKind: number[]
  zIndexValue: number[]
  overflowX: number[]
  overflowY: number[]
  opacity: number[]
  visibility: number[]
  containFlags: number[]
  pointerEvents: number[]
  lineHeight: number[]
  fontFamilyStringId: number[]
  fontSize: number[]
  fontWeight: number[]
}
|
||
|
|
|
||
|
|
/**
 * Columnar table of text runs; every column is a numeric array.
 *
 * `adaptSnapshotToCanonical` currently fills this slot from
 * `emptyTextTable()`, so every column is empty in adapter output.
 */
export interface CanonicalTextTable {
  runId: number[]
  subjectId: number[]
  contentStringId: number[]
  lineBoxId: number[]
  inkLeft: number[]
  inkTop: number[]
  inkRight: number[]
  inkBottom: number[]
  baselineY: number[]
  capHeight: number[]
  computedLineHeight: number[]
}
|
||
|
|
|
||
|
|
/**
 * Columnar topology table; every column is a numeric array.
 *
 * `adaptSnapshotToCanonical` passes `snapshot.topology` through unchanged.
 * NOTE(review): the `*Of` columns are presumably indexed by subject and
 * hold the id of the related subject — confirm against imhotep-core.
 */
export interface CanonicalTopologyTable {
  containingBlockOf: number[]
  nearestPositionedAncestorOf: number[]
  scrollContainerOf: number[]
  stackingContextOf: number[]
  formattingContextOf: number[]
  clippingRootOf: number[]
  paintOrderBucket: number[]
  paintOrderIndex: number[]
}
|
||
|
|
|
||
|
|
/**
 * Columnar table of scroll state.
 *
 * `adaptSnapshotToCanonical` builds each column by mapping over
 * `snapshot.scroll`, so all columns have one entry per scroll record, in
 * the same order.
 */
export interface CanonicalScrollTable {
  containerId: number[]
  scrollLeft: number[]
  scrollTop: number[]
  scrollWidth: number[]
  scrollHeight: number[]
  clientWidth: number[]
  clientHeight: number[]
}
|
||
|
|
|
||
|
|
/**
 * Columnar table of clip nodes.
 *
 * `adaptSnapshotToCanonical` builds each column by mapping over
 * `snapshot.clipping`, so all columns have one entry per clip record, in
 * the same order.
 */
export interface CanonicalClippingTable {
  clipNodeId: number[]
  subjectId: number[]
  clipKind: number[]
  clipLeft: number[]
  clipTop: number[]
  clipRight: number[]
  clipBottom: number[]
  parentClipNodeId: number[]
}
|
||
|
|
|
||
|
|
/**
 * Columnar table of paint nodes; every column is a numeric array.
 *
 * `adaptSnapshotToCanonical` currently fills this slot from
 * `emptyPaintTable()`, so every column is empty in adapter output.
 */
export interface CanonicalPaintTable {
  paintNodeId: number[]
  subjectId: number[]
  stackingContextId: number[]
  bucket: number[]
  localPaintIndex: number[]
}
|
||
|
|
|
||
|
|
/**
 * Columnar visibility table; the flag columns are numeric, not boolean.
 *
 * `adaptSnapshotToCanonical` currently fills this slot with placeholder
 * rows from `emptyVisibilityTable` (flags 1, areas 0), one per subject.
 */
export interface CanonicalVisibilityTable {
  subjectId: number[]
  isRendered: number[]
  isVisible: number[]
  visibleArea: number[]
  clippedArea: number[]
}
|
||
|
|
|
||
|
|
/**
 * Columnar provenance table.
 *
 * `adaptSnapshotToCanonical` builds each column by mapping over
 * `snapshot.provenance`, so all columns have one entry per provenance
 * record, in the same order.
 */
export interface CanonicalProvenanceTable {
  factId: number[]
  extractionStepId: number[]
  sourceKind: number[]
  sourceRef: number[]
}
|
||
|
|
|
||
|
|
/**
 * Columnar confidence table.
 *
 * `adaptSnapshotToCanonical` builds each column by mapping over
 * `snapshot.confidence`, so all columns have one entry per confidence
 * record, in the same order.
 */
export interface CanonicalConfidenceTable {
  factId: number[]
  confidence: number[]
  reasonCode: number[]
}
|
||
|
|
|
||
|
|
/**
 * The canonical world: two string identifiers, the environment and source
 * metadata, and one slot per canonical table.
 *
 * This is the return type of `adaptSnapshotToCanonical`. `boxes` and
 * `visualBoxes` share the same table shape.
 */
export interface CanonicalGeometryWorld {
  sceneId: string
  snapshotId: string
  env: CanonicalWorldEnvironment
  source: CanonicalWorldSource
  strings: CanonicalStringTable
  subjects: CanonicalSubjectsTable
  dom: CanonicalDomTable
  frames: CanonicalFramesTable
  matrices: CanonicalMatricesTable
  rects: CanonicalRectsTable
  boxes: CanonicalBoxesTable
  visualBoxes: CanonicalBoxesTable
  fragments: CanonicalFragmentsTable
  transforms: CanonicalTransformsTable
  styles: CanonicalStylesTable
  text: CanonicalTextTable
  topology: CanonicalTopologyTable
  scroll: CanonicalScrollTable
  clipping: CanonicalClippingTable
  paint: CanonicalPaintTable
  visibility: CanonicalVisibilityTable
  provenance: CanonicalProvenanceTable
  confidence: CanonicalConfidenceTable
}
|
||
|
|
|
||
|
|
/**
 * Flatten the extracted DOM node list into the columnar canonical DOM table.
 *
 * Each node contributes exactly one entry to every column, in `dom.nodes`
 * order, so index `i` across all columns describes the same node.
 *
 * @param dom - DOM extraction result; only `dom.nodes` is read.
 * @returns A table whose columns all have as many entries as `dom.nodes`.
 */
function adaptDom(dom: DOMExtractionResult): CanonicalDomTable {
  const table: CanonicalDomTable = {
    nodeId: [],
    backendNodeId: [],
    parentNodeId: [],
    firstChildIndex: [],
    childCount: [],
    shadowRootKind: [],
    tagNameStringId: [],
    roleStringId: [],
    ariaNameStringId: [],
  }

  for (const node of dom.nodes) {
    // Numeric encoding of the shadow root kind: open = 1, closed = 2,
    // and any other value (including a missing one) = 0.
    let shadowKindCode = 0
    if (node.shadowRootKind === 'open') {
      shadowKindCode = 1
    } else if (node.shadowRootKind === 'closed') {
      shadowKindCode = 2
    }

    table.nodeId.push(node.nodeId)
    table.backendNodeId.push(node.backendNodeId)
    table.parentNodeId.push(node.parentNodeId)
    table.firstChildIndex.push(node.firstChildIndex)
    table.childCount.push(node.childCount)
    table.shadowRootKind.push(shadowKindCode)
    table.tagNameStringId.push(node.tagNameStringId)
    table.roleStringId.push(node.roleStringId)
    table.ariaNameStringId.push(node.ariaNameStringId)
  }

  return table
}
|
||
|
|
|
||
|
|
/**
 * Wrap a list of strings in the canonical string-table shape.
 *
 * @param strings - The string list to expose as the table's `values`.
 * @returns A table whose `values` is the same array instance that was
 *   passed in; the array is not copied.
 */
function adaptStringTable(strings: string[]): CanonicalStringTable {
  const table: CanonicalStringTable = { values: strings }
  return table
}
|
||
|
|
|
||
|
|
/**
 * Convert a timestamp string into epoch milliseconds.
 *
 * @param iso - Timestamp string, handed to the `Date` constructor as-is.
 * @returns Milliseconds since the Unix epoch, or `NaN` when `Date` cannot
 *   parse the input. No error is thrown for unparseable input.
 */
function adaptTimestamp(iso: string): number {
  const parsed = new Date(iso)
  return parsed.getTime()
}
|
||
|
|
|
||
|
|
/**
 * Build a text table with no rows.
 *
 * @returns A new table object in which every column is a fresh empty array,
 *   so separate calls never share column instances.
 */
function emptyTextTable(): CanonicalTextTable {
  const table: CanonicalTextTable = {
    runId: [],
    subjectId: [],
    contentStringId: [],
    lineBoxId: [],
    inkLeft: [],
    inkTop: [],
    inkRight: [],
    inkBottom: [],
    baselineY: [],
    capHeight: [],
    computedLineHeight: [],
  }
  return table
}
|
||
|
|
|
||
|
|
/**
 * Build a paint table with no rows.
 *
 * @returns A new table object in which every column is a fresh empty array,
 *   so separate calls never share column instances.
 */
function emptyPaintTable(): CanonicalPaintTable {
  const table: CanonicalPaintTable = {
    paintNodeId: [],
    subjectId: [],
    stackingContextId: [],
    bucket: [],
    localPaintIndex: [],
  }
  return table
}
|
||
|
|
|
||
|
|
/**
 * Build a placeholder visibility table with one row per subject.
 *
 * Every row is marked rendered (1) and visible (1) with a visible area and
 * clipped area of 0; nothing is measured here.
 *
 * NOTE(review): `subjectId` is filled with the index range
 * `0 .. subjectCount - 1`, not with ids read from a subjects table. This
 * is only right if subject ids are dense and zero-based — confirm.
 *
 * @param subjectCount - Number of rows to generate.
 * @returns A table whose columns all have length `subjectCount`, each
 *   backed by its own array.
 */
function emptyVisibilityTable(subjectCount: number): CanonicalVisibilityTable {
  // Each call allocates a separate array so that columns never alias.
  const constantColumn = (value: number): number[] => Array(subjectCount).fill(value)

  return {
    subjectId: Array.from({ length: subjectCount }, (_, index) => index),
    isRendered: constantColumn(1),
    isVisible: constantColumn(1),
    visibleArea: constantColumn(0),
    clippedArea: constantColumn(0),
  }
}
|
||
|
|
|
||
|
|
/**
 * Convert a CDP GeometryWorldSnapshot into the canonical GeometryWorld shape.
 *
 * What the mapping does, slot by slot:
 * - `sceneId`, `snapshotId`, `env`, `subjects`, `frames`, `matrices`,
 *   `rects`, `boxes`, `visualBoxes`, `fragments`, `transforms`, `styles`
 *   and `topology` are passed through by reference, not copied.
 * - `source` is rebuilt field by field; `engine` is always `'chromium-cdp'`
 *   and the `extractedAt` timestamp string is converted to epoch millis
 *   (`NaN` if the string cannot be parsed).
 * - `strings` wraps the snapshot's string list; `dom` flattens the node
 *   list into columnar arrays.
 * - `scroll`, `clipping`, `provenance` and `confidence` are transposed from
 *   record lists into one array per field, preserving record order.
 * - `text` and `paint` are always emitted empty, and `visibility` holds
 *   placeholder rows (one per entry of `snapshot.subjects.ids`). The result
 *   is therefore not a lossless image of everything a snapshot could carry.
 *
 * @param snapshot - Snapshot produced by the CDP extractor.
 * @returns The canonical world built from `snapshot`.
 */
export function adaptSnapshotToCanonical(snapshot: GeometryWorldSnapshot): CanonicalGeometryWorld {
  const { source, scroll, clipping, provenance, confidence } = snapshot

  const canonicalSource: CanonicalWorldSource = {
    url: source.url,
    browserName: source.browserName,
    browserVersion: source.browserVersion,
    engine: 'chromium-cdp',
    extractedAt: adaptTimestamp(source.extractedAt),
  }

  const scrollTable: CanonicalScrollTable = {
    containerId: scroll.map((entry) => entry.containerId),
    scrollLeft: scroll.map((entry) => entry.scrollLeft),
    scrollTop: scroll.map((entry) => entry.scrollTop),
    scrollWidth: scroll.map((entry) => entry.scrollWidth),
    scrollHeight: scroll.map((entry) => entry.scrollHeight),
    clientWidth: scroll.map((entry) => entry.clientWidth),
    clientHeight: scroll.map((entry) => entry.clientHeight),
  }

  const clippingTable: CanonicalClippingTable = {
    clipNodeId: clipping.map((clip) => clip.clipNodeId),
    subjectId: clipping.map((clip) => clip.subjectId),
    clipKind: clipping.map((clip) => clip.clipKind),
    clipLeft: clipping.map((clip) => clip.clipLeft),
    clipTop: clipping.map((clip) => clip.clipTop),
    clipRight: clipping.map((clip) => clip.clipRight),
    clipBottom: clipping.map((clip) => clip.clipBottom),
    parentClipNodeId: clipping.map((clip) => clip.parentClipNodeId),
  }

  const provenanceTable: CanonicalProvenanceTable = {
    factId: provenance.map((fact) => fact.factId),
    extractionStepId: provenance.map((fact) => fact.extractionStepId),
    sourceKind: provenance.map((fact) => fact.sourceKind),
    sourceRef: provenance.map((fact) => fact.sourceRef),
  }

  const confidenceTable: CanonicalConfidenceTable = {
    factId: confidence.map((fact) => fact.factId),
    confidence: confidence.map((fact) => fact.confidence),
    reasonCode: confidence.map((fact) => fact.reasonCode),
  }

  return {
    sceneId: snapshot.sceneId,
    snapshotId: snapshot.snapshotId,
    env: snapshot.env,
    source: canonicalSource,
    strings: adaptStringTable(snapshot.strings),
    subjects: snapshot.subjects,
    dom: adaptDom(snapshot.dom),
    frames: snapshot.frames,
    matrices: snapshot.matrices,
    rects: snapshot.rects,
    boxes: snapshot.boxes,
    visualBoxes: snapshot.visualBoxes,
    fragments: snapshot.fragments,
    transforms: snapshot.transforms,
    styles: snapshot.styles,
    // Text and paint are not derived from the snapshot; they stay empty.
    text: emptyTextTable(),
    topology: snapshot.topology,
    scroll: scrollTable,
    clipping: clippingTable,
    paint: emptyPaintTable(),
    // Placeholder rows only: one per subject, no measured visibility.
    visibility: emptyVisibilityTable(snapshot.subjects.ids.length),
    provenance: provenanceTable,
    confidence: confidenceTable,
  }
}
|