344 lines
10 KiB
TypeScript
344 lines
10 KiB
TypeScript
|
|
// World materialization from extraction results
|
||
|
|
// Orchestrates normalization into an immutable geometry world
|
||
|
|
|
||
|
|
import {
|
||
|
|
GeometryWorld,
|
||
|
|
StringTable,
|
||
|
|
Matrices,
|
||
|
|
buildWorldIndex,
|
||
|
|
WorldIndex,
|
||
|
|
} from './world.js'
|
||
|
|
import {
|
||
|
|
RawExtractionResult,
|
||
|
|
normalizeEnv,
|
||
|
|
normalizeSource,
|
||
|
|
normalizeSubjects,
|
||
|
|
normalizeDom,
|
||
|
|
normalizeFrames,
|
||
|
|
normalizeRects,
|
||
|
|
normalizeBoxes,
|
||
|
|
normalizeFragments,
|
||
|
|
normalizeTransforms,
|
||
|
|
normalizeStyles,
|
||
|
|
normalizeText,
|
||
|
|
normalizeTopology,
|
||
|
|
normalizeScroll,
|
||
|
|
normalizeClipping,
|
||
|
|
normalizePaint,
|
||
|
|
normalizeVisibility,
|
||
|
|
normalizeProvenance,
|
||
|
|
normalizeConfidence,
|
||
|
|
} from './normalize.js'
|
||
|
|
import { IDENTITY_4X4 } from './transforms.js'
|
||
|
|
|
||
|
|
// ---------------------------------------------------------------------------
|
||
|
|
// Materialization
|
||
|
|
// ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
/**
 * Materialize a geometry world from raw extraction results.
 *
 * Runs each normalizer over its slice of `raw`, sharing one string table
 * across the normalizers that accept it and threading the matrix pool through
 * `normalizeFrames` and then `normalizeTransforms`. The four domain indexes
 * (selector, ancestor, line box, text run) start empty and are filled by
 * `buildDomainIndexes` before the world is returned.
 *
 * NOTE(review): the world is meant to be treated as immutable, but nothing in
 * this function freezes it — callers must not mutate the result.
 *
 * @param raw - Raw extraction result to normalize.
 * @returns The populated world, including its domain indexes.
 */
export function materializeWorld(raw: RawExtractionResult): GeometryWorld {
  const strings: StringTable = { values: [] }

  // The call order below determines string ids, so it is kept stable.
  const env = normalizeEnv(raw.env, strings)
  const source = normalizeSource(raw.source, strings)

  const subjects = normalizeSubjects(raw.subjects)
  const dom = normalizeDom(raw.dom, strings)
  const rects = normalizeRects(raw.rects)
  const boxes = normalizeBoxes(raw.boxes)
  const fragments = normalizeFragments(raw.fragments)

  // Frames and transforms both append to the matrix pool; each call returns
  // the pool to use from then on.
  let matrices: Matrices = { values: new Float64Array(0) }

  const framesResult = normalizeFrames(raw.frames, strings, matrices)
  const frames = framesResult.frames
  matrices = framesResult.matrices

  const transformsResult = normalizeTransforms(raw.transforms, matrices)
  const transforms = transformsResult.transforms
  matrices = transformsResult.matrices

  const styles = normalizeStyles(raw.styles, strings)
  const text = normalizeText(raw.text, strings)
  const topology = normalizeTopology(raw.topology, raw.subjects.length)
  const scroll = normalizeScroll(raw.scroll)
  const clipping = normalizeClipping(raw.clipping, strings)
  const paint = normalizePaint(raw.paint)
  const visibility = normalizeVisibility(raw.visibility)
  const provenance = normalizeProvenance(raw.provenance, strings)
  const confidence = normalizeConfidence(raw.confidence, strings)

  // Intern the scene and snapshot ids last, looking them up at the moment of
  // insertion. Looking them up earlier and pushing later would add a duplicate
  // entry whenever a normalizer interned the same string in between, or when
  // both ids are the same string.
  const existingSceneId = strings.values.indexOf(raw.sceneId)
  const sceneId = existingSceneId >= 0 ? existingSceneId : strings.values.push(raw.sceneId) - 1
  const existingSnapshotId = strings.values.indexOf(raw.snapshotId)
  const snapshotId =
    existingSnapshotId >= 0 ? existingSnapshotId : strings.values.push(raw.snapshotId) - 1

  const world: GeometryWorld = {
    sceneId,
    snapshotId,
    env,
    source,
    strings,
    subjects,
    dom,
    frames,
    matrices,
    rects,
    boxes,
    fragments,
    transforms,
    styles,
    text,
    topology,
    scroll,
    clipping,
    paint,
    visibility,
    provenance,
    confidence,
    selectorIndex: new Map<string, number[]>(),
    ancestorIndex: new Map<number, number[]>(),
    lineBoxIndex: new Map<number, number[]>(),
    textRunIndex: new Map<number, number[]>(),
  }

  buildDomainIndexes(world)

  return world
}
|
||
|
|
|
||
|
|
// ---------------------------------------------------------------------------
|
||
|
|
// Domain index construction (V1.1)
|
||
|
|
// ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
/**
 * Populate the four domain indexes of `world` in place.
 *
 * - `selectorIndex`: lowercased tag name and `.class` selector -> ids of the
 *   element subjects (kind 1) that match it.
 * - `ancestorIndex`: subject id -> ids of every subject found beneath its DOM
 *   node, at any depth.
 * - `lineBoxIndex`: subject id -> ids of its line fragments (kind 1).
 * - `textRunIndex`: subject id -> ids of its text runs.
 *
 * Every id list is sorted ascending so enumeration is deterministic.
 *
 * @param world - World whose index maps are filled; its tables are only read.
 */
function buildDomainIndexes(world: GeometryWorld): void {
  // Appends `value` to the list stored under `key`, creating the list on first use.
  const append = <K>(index: Map<K, number[]>, key: K, value: number): void => {
    const list = index.get(key)
    if (list) {
      list.push(value)
    } else {
      index.set(key, [value])
    }
  }
  const ascending = (a: number, b: number): number => a - b

  // ---- Selector index -----------------------------------------------------
  const selectorIndex = world.selectorIndex

  // DOM node id -> row in the dom table.
  const domNodeById = new Map<number, number>()
  for (let i = 0; i < world.dom.nodeId.length; i++) {
    domNodeById.set(world.dom.nodeId[i], i)
  }

  for (let i = 0; i < world.subjects.ids.length; i++) {
    const domNodeId = world.subjects.domNodeId[i]
    // Only element subjects that are backed by a DOM node are selectable.
    if (world.subjects.subjectKind[i] !== 1 /* SubjectKind.Element */ || domNodeId === 0) continue

    const domIdx = domNodeById.get(domNodeId)
    if (domIdx === undefined) continue

    const subjectId = world.subjects.ids[i]

    // Tag selector
    const tagName = world.strings.values[world.dom.tagNameStringId[domIdx]]
    if (tagName) {
      append(selectorIndex, tagName.toLowerCase(), subjectId)
    }

    // Class selectors. Class names that differ only by case (or repeat)
    // collapse to one selector, so each selector is recorded once per subject.
    const classIds = world.dom.classNameStringIds[domIdx]
    const seenSelectors = new Set<string>()
    for (let c = 0; c < classIds.length; c++) {
      const className = world.strings.values[classIds[c]]
      if (!className) continue
      const classSelector = '.' + className.toLowerCase()
      if (seenSelectors.has(classSelector)) continue
      seenSelectors.add(classSelector)
      append(selectorIndex, classSelector, subjectId)
    }
  }

  for (const ids of selectorIndex.values()) {
    ids.sort(ascending)
  }

  // ---- Ancestor index -----------------------------------------------------
  const ancestorIndex = world.ancestorIndex

  // Parent DOM node id -> child DOM node ids (a parent id of 0 means no parent).
  const childrenByParent = new Map<number, number[]>()
  for (let i = 0; i < world.dom.nodeId.length; i++) {
    const parentId = world.dom.parentNodeId[i]
    if (parentId !== 0) {
      append(childrenByParent, parentId, world.dom.nodeId[i])
    }
  }

  // DOM node id -> subject id. If several subjects share a DOM node, the last
  // one in table order wins.
  const subjectByDomNodeId = new Map<number, number>()
  for (let i = 0; i < world.subjects.ids.length; i++) {
    const domNodeId = world.subjects.domNodeId[i]
    if (domNodeId !== 0) {
      subjectByDomNodeId.set(domNodeId, world.subjects.ids[i])
    }
  }

  for (let i = 0; i < world.dom.nodeId.length; i++) {
    const rootDomId = world.dom.nodeId[i]
    const subjectId = subjectByDomNodeId.get(rootDomId)
    if (subjectId === undefined) continue

    // Iterative walk: deep trees cannot overflow the call stack, and the
    // visited set makes malformed (cyclic) parent links terminate.
    const descendants: number[] = []
    const visited = new Set<number>([rootDomId])
    const pending: number[] = [rootDomId]
    for (let current = pending.pop(); current !== undefined; current = pending.pop()) {
      const childDomIds = childrenByParent.get(current)
      if (!childDomIds) continue
      for (const childDomId of childDomIds) {
        if (visited.has(childDomId)) continue
        visited.add(childDomId)
        const childSubjectId = subjectByDomNodeId.get(childDomId)
        if (childSubjectId !== undefined) {
          descendants.push(childSubjectId)
        }
        // Keep descending even through nodes without a subject, so subjects
        // deeper in that subtree are still recorded as descendants.
        pending.push(childDomId)
      }
    }

    descendants.sort(ascending)
    ancestorIndex.set(subjectId, descendants)
  }

  // ---- Line box index -----------------------------------------------------
  const lineBoxIndex = world.lineBoxIndex
  for (let i = 0; i < world.fragments.fragmentId.length; i++) {
    if (world.fragments.fragmentKind[i] === 1 /* FragmentKind.Line */) {
      append(lineBoxIndex, world.fragments.subjectId[i], world.fragments.fragmentId[i])
    }
  }
  for (const ids of lineBoxIndex.values()) {
    ids.sort(ascending)
  }

  // ---- Text run index -----------------------------------------------------
  const textRunIndex = world.textRunIndex
  for (let i = 0; i < world.text.runId.length; i++) {
    append(textRunIndex, world.text.subjectId[i], world.text.runId[i])
  }
  for (const ids of textRunIndex.values()) {
    ids.sort(ascending)
  }
}
|
||
|
|
|
||
|
|
/**
 * Convenience entry point: materializes the world from `raw`, derives the
 * world index from that world, and returns both together.
 *
 * @param raw - Raw extraction result to materialize.
 * @returns The materialized world and the index built from it.
 */
export function materializeWorldWithIndex(raw: RawExtractionResult): {
  world: GeometryWorld
  index: WorldIndex
} {
  const world = materializeWorld(raw)
  return { world, index: buildWorldIndex(world) }
}
|
||
|
|
|
||
|
|
// ---------------------------------------------------------------------------
|
||
|
|
// World validation
|
||
|
|
// ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
/**
 * One structural problem reported by `validateWorld`.
 */
export interface WorldValidationError {
  /** Machine-readable error code, e.g. `'INVALID_DOM_REF'`. */
  code: string
  /** Human-readable description naming the offending id and the missing reference. */
  message: string
  /** Name of the world table holding the offending row, e.g. `'subjects'`. */
  table: string
  /** Row position of the offending entry within `table`. */
  index: number
}
|
||
|
|
|
||
|
|
/**
 * Validate a geometry world for structural integrity.
 *
 * Checks, in this order:
 *  1. every subject with a non-zero `domNodeId` refers to an existing DOM node
 *     (`INVALID_DOM_REF`);
 *  2. every box refers to an existing subject (`INVALID_SUBJECT_REF`) and an
 *     existing frame (`INVALID_FRAME_REF`);
 *  3. every frame with a non-zero `parentFrameId` refers to an existing frame
 *     (`INVALID_PARENT_FRAME`).
 *
 * A zero `domNodeId` or `parentFrameId` is treated as "no reference" and is
 * skipped. Box references are always checked, including zero.
 *
 * @param world - World to inspect; it is not modified.
 * @returns The validation errors in check order (empty if valid).
 */
export function validateWorld(world: GeometryWorld): WorldValidationError[] {
  const errors: WorldValidationError[] = []

  // Collect each id column into a set once, so every reference check below is
  // a constant-time lookup instead of a linear scan of the target table.
  const toIdSet = (ids: ArrayLike<number>): Set<number> => {
    const known = new Set<number>()
    for (let i = 0; i < ids.length; i++) {
      known.add(ids[i])
    }
    return known
  }
  const domNodeIds = toIdSet(world.dom.nodeId)
  const subjectIds = toIdSet(world.subjects.ids)
  const frameIds = toIdSet(world.frames.frameId)

  // Check that subjects have valid domNodeId references
  for (let i = 0; i < world.subjects.ids.length; i++) {
    const domNodeId = world.subjects.domNodeId[i]
    if (domNodeId === 0 || domNodeIds.has(domNodeId)) continue
    errors.push({
      code: 'INVALID_DOM_REF',
      message: `Subject ${world.subjects.ids[i]} references missing domNodeId ${domNodeId}`,
      table: 'subjects',
      index: i,
    })
  }

  // Check that boxes have valid subject and frame references
  for (let i = 0; i < world.boxes.boxId.length; i++) {
    const subjectId = world.boxes.subjectId[i]
    if (!subjectIds.has(subjectId)) {
      errors.push({
        code: 'INVALID_SUBJECT_REF',
        message: `Box ${world.boxes.boxId[i]} references missing subjectId ${subjectId}`,
        table: 'boxes',
        index: i,
      })
    }

    const frameId = world.boxes.frameId[i]
    if (!frameIds.has(frameId)) {
      errors.push({
        code: 'INVALID_FRAME_REF',
        message: `Box ${world.boxes.boxId[i]} references missing frameId ${frameId}`,
        table: 'boxes',
        index: i,
      })
    }
  }

  // Check that frames have valid parent references
  for (let i = 0; i < world.frames.frameId.length; i++) {
    const parentId = world.frames.parentFrameId[i]
    if (parentId === 0 || frameIds.has(parentId)) continue
    errors.push({
      code: 'INVALID_PARENT_FRAME',
      message: `Frame ${world.frames.frameId[i]} references missing parentFrameId ${parentId}`,
      table: 'frames',
      index: i,
    })
  }

  return errors
}
|