// Imhotep/packages/imhotep-core/src/geometry-cache.ts
/**
* Geometry snapshot caching for Imhotep.
*
* Serializes and deserializes GeometryWorld instances so extracted
* browser geometry can be cached to disk and reused across assertion
* batches without re-extraction.
*/
import type { GeometryWorld, WorldEnvironment, WorldSource } from './world.js'
import type { ImhotepId } from './types.js'
// ---------------------------------------------------------------------------
// Serialization
// ---------------------------------------------------------------------------
/**
 * JSON-safe form of a single typed-array column.
 *
 * `type` is the constructor name of the array that was serialized and `data`
 * is a plain-array copy of its elements.
 */
interface SerializedTypedArray {
  type:
    | 'Uint32Array'
    | 'Uint16Array'
    | 'Uint8Array'
    | 'Int32Array'
    | 'Int16Array'
    | 'Float64Array'
    | 'Float32Array'
  data: number[]
}
/**
 * JSON-safe form of one world table: each column name maps either to a
 * tagged typed-array payload or to a plain array of strings or numbers.
 */
interface SerializedTable {
  [columnName: string]:
    | SerializedTypedArray
    | string[]
    | number[]
}
/** Names of the world tables that are stored as `SerializedTable` values. */
type SerializedWorldTableName =
  | 'subjects'
  | 'dom'
  | 'frames'
  | 'matrices'
  | 'rects'
  | 'boxes'
  | 'visualBoxes'
  | 'fragments'
  | 'transforms'
  | 'styles'
  | 'text'
  | 'topology'
  | 'scroll'
  | 'clipping'
  | 'paint'
  | 'visibility'
  | 'provenance'
  | 'confidence'

/**
 * On-disk JSON shape of a GeometryWorld.
 *
 * The identity, environment, source and string-pool fields are stored as-is;
 * every table named by `SerializedWorldTableName` is stored as a
 * `SerializedTable`. `version` identifies the payload format.
 */
interface SerializedWorld extends Record<SerializedWorldTableName, SerializedTable> {
  version: number
  sceneId: ImhotepId
  snapshotId: ImhotepId
  env: WorldEnvironment
  source: WorldSource
  strings: { values: string[] }
}
/**
 * Convert a typed array into its JSON-safe tagged form.
 *
 * @param arr - The typed array to serialize.
 * @returns An object carrying the array's constructor name and a plain-array
 *   copy of its elements.
 */
function serializeTypedArray(arr: TypedArray): SerializedTypedArray {
  // The constructor name doubles as the tag used to rebuild the array later.
  const type = arr.constructor.name as SerializedTypedArray['type']
  const data: number[] = Array.from(arr)
  return { type, data }
}
/**
 * Allow-list of constructors a serialized column may name, keyed by that name.
 *
 * Looking the constructor up here, rather than on `globalThis`, means a cache
 * file cannot make the deserializer instantiate an arbitrary global such as
 * `Function` or `Array`.
 */
const TYPED_ARRAY_CTORS_BY_NAME: Record<SerializedTypedArray['type'], new (data: number[]) => TypedArray> = {
  Uint32Array,
  Uint16Array,
  Uint8Array,
  Int32Array,
  Int16Array,
  Float64Array,
  Float32Array,
}

/**
 * Rebuild a typed array from its JSON-safe tagged form.
 *
 * @param serialized - Tagged payload, normally produced by `serializeTypedArray`
 *   and read back through `JSON.parse`.
 * @returns A new typed array of the tagged kind holding the payload's elements.
 * @throws Error if `type` is not one of the supported typed-array names or
 *   `data` is not an array.
 */
function deserializeTypedArray(serialized: SerializedTypedArray): TypedArray {
  // Own-property check so inherited keys like "constructor" are not accepted.
  const ctor = Object.prototype.hasOwnProperty.call(TYPED_ARRAY_CTORS_BY_NAME, serialized.type)
    ? TYPED_ARRAY_CTORS_BY_NAME[serialized.type]
    : undefined
  if (ctor === undefined) {
    throw new Error(`Unsupported typed array type in geometry cache: ${String(serialized.type)}`)
  }
  // A bare number would be interpreted as a length and allocate a zeroed
  // buffer of that size, so only real arrays are accepted.
  if (!Array.isArray(serialized.data)) {
    throw new Error(`Invalid typed array data in geometry cache for ${serialized.type}: expected an array`)
  }
  return new ctor(serialized.data)
}
/** The typed-array kinds that the geometry cache can serialize. */
type TypedArray =
  // Unsigned integer columns
  | Uint32Array
  | Uint16Array
  | Uint8Array
  // Signed integer columns
  | Int32Array
  | Int16Array
  // Floating-point columns
  | Float64Array
  | Float32Array
/** Constructors of every typed-array kind covered by `TypedArray`. */
const SUPPORTED_TYPED_ARRAY_CLASSES = [
  Uint32Array,
  Uint16Array,
  Uint8Array,
  Int32Array,
  Int16Array,
  Float64Array,
  Float32Array,
] as const

/**
 * Type guard: report whether `value` is an instance of one of the supported
 * typed-array classes.
 *
 * @param value - Any value.
 * @returns `true` when `value` is an instance of a supported class.
 */
function isTypedArray(value: unknown): value is TypedArray {
  return SUPPORTED_TYPED_ARRAY_CLASSES.some((cls) => value instanceof cls)
}
/**
 * Convert one world table into its JSON-safe form.
 *
 * Typed-array columns are replaced by tagged payloads so they can be rebuilt
 * on load. Every other value (plain arrays, scalars, anything else) is stored
 * unchanged. Scalars were previously wrapped in a one-element array, but
 * `deserializeTable` never unwraps them, so a field such as `5` came back as
 * `[5]`; they now pass through untouched so the round trip preserves them.
 *
 * @param table - The table to serialize, viewed as a column-name → value map.
 *   A falsy table yields an empty result.
 * @returns The serialized table.
 */
function serializeTable(table: Record<string, unknown>): SerializedTable {
  const result: SerializedTable = {}
  if (!table) return result
  for (const [key, value] of Object.entries(table)) {
    result[key] = isTypedArray(value)
      ? serializeTypedArray(value)
      : // Non-typed-array values are already JSON-representable as they are.
        (value as SerializedTable[string])
  }
  return result
}
/**
 * Rebuild one world table from its JSON-safe form.
 *
 * A column whose value is a non-null object containing both a `type` and a
 * `data` key is treated as a tagged typed array and rebuilt; every other
 * value is copied across unchanged.
 *
 * @param serialized - The serialized table.
 * @returns A column-name → value map with typed-array columns restored.
 */
function deserializeTable(serialized: SerializedTable): Record<string, unknown> {
  const table: Record<string, unknown> = {}
  for (const column of Object.keys(serialized)) {
    const raw: unknown = serialized[column]
    const isTagged =
      Boolean(raw) && typeof raw === 'object' && 'type' in (raw as object) && 'data' in (raw as object)
    table[column] = isTagged ? deserializeTypedArray(raw as SerializedTypedArray) : raw
  }
  return table
}
/**
 * Serialize a GeometryWorld to a JSON string.
 *
 * Identity, environment, source and string-pool fields are written as-is;
 * each table is passed through `serializeTable`. The payload is tagged with
 * `version: 1`.
 *
 * @param world - The world to serialize.
 * @returns The JSON text of the serialized world.
 */
export function serializeGeometryWorld(world: GeometryWorld): string {
  // Listed in the order the tables appear in the serialized payload.
  const tableNames = [
    'subjects',
    'dom',
    'frames',
    'matrices',
    'rects',
    'boxes',
    'visualBoxes',
    'fragments',
    'transforms',
    'styles',
    'text',
    'topology',
    'scroll',
    'clipping',
    'paint',
    'visibility',
    'provenance',
    'confidence',
  ] as const

  const header = {
    version: 1,
    sceneId: world.sceneId,
    snapshotId: world.snapshotId,
    env: world.env,
    source: world.source,
    strings: world.strings,
  }

  const source = world as unknown as Record<string, Record<string, unknown>>
  const tables = Object.fromEntries(
    tableNames.map((name) => [name, serializeTable(source[name])]),
  ) as Record<(typeof tableNames)[number], SerializedTable>

  const serialized: SerializedWorld = { ...header, ...tables }
  return JSON.stringify(serialized)
}
/**
 * Deserialize a JSON string back into a GeometryWorld.
 *
 * @param json - JSON text, normally produced by `serializeGeometryWorld`.
 * @returns The reconstructed world, with each table passed through
 *   `deserializeTable`.
 * @throws Error if the payload's `version` is not 1. Errors from `JSON.parse`
 *   and from table deserialization are not caught here.
 */
export function deserializeGeometryWorld(json: string): GeometryWorld {
  const parsed = JSON.parse(json) as SerializedWorld
  if (parsed.version !== 1) {
    throw new Error(`Unsupported geometry cache version: ${parsed.version}`)
  }

  // Listed in the order the tables are added to the resulting world.
  const tableNames = [
    'subjects',
    'dom',
    'frames',
    'matrices',
    'rects',
    'boxes',
    'visualBoxes',
    'fragments',
    'transforms',
    'styles',
    'text',
    'topology',
    'scroll',
    'clipping',
    'paint',
    'visibility',
    'provenance',
    'confidence',
  ] as const

  const world: Record<string, unknown> = {
    sceneId: parsed.sceneId,
    snapshotId: parsed.snapshotId,
    env: parsed.env,
    source: parsed.source,
    strings: parsed.strings,
  }
  for (const name of tableNames) {
    world[name] = deserializeTable(parsed[name])
  }
  return world as unknown as GeometryWorld
}
// ---------------------------------------------------------------------------
// Cache key computation
// ---------------------------------------------------------------------------
/**
 * Compute a 32-bit djb2 hash of a string's UTF-16 code units.
 *
 * @param str - The text to hash.
 * @returns The hash as exactly eight lowercase hexadecimal digits.
 */
function djb2Hash(str: string): string {
  let hash = 5381
  for (let i = 0; i < str.length; i++) {
    // `>>> 0` keeps the running value an unsigned 32-bit integer. Masking with
    // `& 0xffffffff` produced a signed result, so roughly half of all inputs
    // were rendered with a leading "-" and fewer than eight hex digits.
    hash = ((hash << 5) + hash + str.charCodeAt(i)) >>> 0
  }
  return hash.toString(16).padStart(8, '0')
}
/**
 * Schema version for cache invalidation. Increment when the world schema
 * changes in a way that makes previously cached extraction results incompatible.
 *
 * This value is the leading component of every key built by
 * `computeGeometryCacheKey`, so bumping it changes all keys. It is separate
 * from the `version` field written inside the serialized payloads.
 */
export const WORLD_CACHE_SCHEMA_VERSION = 2
/**
 * Compute a stable cache key for a geometry extraction.
 *
 * The key has four dash-separated parts, in this order:
 * - the schema version (`WORLD_CACHE_SCHEMA_VERSION`)
 * - a hash of the page URL
 * - a hash of the selectors, sorted first so their order does not matter
 * - a hash of the environment fields listed below, each with a fallback
 *   default when the field is absent
 *
 * @param pageUrl - URL of the page the geometry was extracted from.
 * @param selectors - Selectors used for extraction; the array is not mutated.
 * @param env - Environment settings; missing fields fall back to defaults.
 * @returns The cache key string.
 */
export function computeGeometryCacheKey(
  pageUrl: string,
  selectors: string[],
  env: Partial<WorldEnvironment>,
): string {
  // Sort a copy and join with NUL so the selector list has a canonical form.
  const canonicalSelectors = selectors.slice().sort().join('\x00')

  // Field order is fixed here so the JSON text is canonical as well.
  const canonicalEnv = JSON.stringify({
    viewportWidth: env.viewportWidth ?? 0,
    viewportHeight: env.viewportHeight ?? 0,
    deviceScaleFactor: env.deviceScaleFactor ?? 1,
    colorScheme: env.colorScheme ?? 'light',
    pointer: env.pointer ?? 'fine',
    hover: env.hover ?? false,
    reducedMotion: env.reducedMotion ?? 'no-preference',
    locale: env.locale ?? 'en',
    writingMode: env.writingMode ?? 'horizontal-tb',
  })

  const parts = [
    WORLD_CACHE_SCHEMA_VERSION,
    djb2Hash(pageUrl),
    djb2Hash(canonicalSelectors),
    djb2Hash(canonicalEnv),
  ]
  return parts.join('-')
}
// ---------------------------------------------------------------------------
// File-based cache storage
// ---------------------------------------------------------------------------
import { existsSync, mkdirSync, rmSync, readdirSync, statSync } from 'node:fs'
import { readFile, writeFile, unlink } from 'node:fs/promises'
import { join } from 'node:path'
/**
 * Default cache directory: `.imhotep-cache` under the process working
 * directory, resolved once when this module is evaluated.
 */
const DEFAULT_CACHE_DIR = join(process.cwd(), '.imhotep-cache')
/** Entry limit handed to `evictOldestEntries` by the cache write functions. */
const DEFAULT_MAX_CACHE_ENTRIES = 100
/**
 * Build the path of the cache file for a key: `<cacheDir>/<cacheKey>.json`.
 *
 * @param cacheDir - Directory holding the cache files.
 * @param cacheKey - Key identifying the entry.
 * @returns The joined file path.
 */
function cacheFilePath(cacheDir: string, cacheKey: string): string {
  const fileName = cacheKey + '.json'
  return join(cacheDir, fileName)
}
/**
 * Remove the oldest cache files until at most `maxEntries` remain.
 *
 * Only files whose name ends in `.json` are considered, and age is judged by
 * modification time. A file that cannot be stat'ed is logged and left out of
 * the count; a file that cannot be removed is logged and skipped. Does
 * nothing when the directory does not exist.
 *
 * @param cacheDir - Directory holding the cache files.
 * @param maxEntries - Number of entries allowed to remain.
 */
function evictOldestEntries(cacheDir: string, maxEntries: number): void {
  if (!existsSync(cacheDir)) return

  const entries: Array<{ name: string; path: string; mtime: number }> = []
  for (const name of readdirSync(cacheDir)) {
    if (!name.endsWith('.json')) continue
    const path = join(cacheDir, name)
    try {
      entries.push({ name, path, mtime: statSync(path).mtimeMs })
    } catch (err) {
      console.warn(`[imhotep-core] cache eviction: stat failed for ${name}: ${err instanceof Error ? err.message : err}`)
    }
  }

  // Oldest first, so the entries to drop sit at the front.
  entries.sort((older, newer) => older.mtime - newer.mtime)

  const excess = entries.length - maxEntries
  for (let idx = 0; idx < excess; idx++) {
    try {
      rmSync(entries[idx].path)
    } catch (err) {
      console.warn(`[imhotep-core] cache eviction: failed to remove ${entries[idx].path}: ${err instanceof Error ? err.message : err}`)
    }
  }
}
/**
 * Read a cached GeometryWorld from disk.
 *
 * Any failure is reported to the caller as a cache miss (`null`). A missing
 * file is the ordinary miss and is returned silently; every other failure
 * (unreadable file, invalid JSON, unsupported version, malformed tables) is
 * logged with `console.warn` so that it stands out from routine misses.
 *
 * @param cacheDir - Directory holding the cache files.
 * @param cacheKey - Key identifying the entry.
 * @returns The cached world, or `null` when it is absent or cannot be loaded.
 */
export async function readCachedWorld(cacheDir: string, cacheKey: string): Promise<GeometryWorld | null> {
  const path = cacheFilePath(cacheDir, cacheKey)
  try {
    const json = await readFile(path, 'utf-8')
    return deserializeGeometryWorld(json)
  } catch (err) {
    // Node file-system errors carry a string `code`; ENOENT means "no such file".
    const code = (err as { code?: unknown } | null | undefined)?.code
    if (code !== 'ENOENT') {
      console.warn(`[imhotep-core] readCachedWorld failed for ${cacheKey}: ${err instanceof Error ? err.message : err}. Cache miss.`)
    }
    return null
  }
}
/**
 * Write a GeometryWorld to the cache on disk.
 *
 * The file write itself is asynchronous; directory creation and eviction use
 * synchronous file-system calls.
 *
 * Eviction runs after the write so the new entry is included in the count.
 * Evicting first trimmed the directory to the limit and then added one more
 * file, leaving it one entry over `DEFAULT_MAX_CACHE_ENTRIES`.
 *
 * @param cacheDir - Directory holding the cache files; created if missing.
 * @param cacheKey - Key identifying the entry.
 * @param world - The world to store.
 */
export async function writeCachedWorld(cacheDir: string, cacheKey: string, world: GeometryWorld): Promise<void> {
  mkdirSync(cacheDir, { recursive: true })
  const path = cacheFilePath(cacheDir, cacheKey)
  await writeFile(path, serializeGeometryWorld(world))
  // The file just written has the newest modification time, so it is the last
  // candidate for removal.
  evictOldestEntries(cacheDir, DEFAULT_MAX_CACHE_ENTRIES)
}
/**
 * Clear all cached geometry snapshots.
 *
 * Removes every file in the directory whose name ends in `.json`; other files
 * are left alone. A file that cannot be removed is logged and skipped. Does
 * nothing when the directory does not exist.
 *
 * @param cacheDir - Directory to clear; defaults to the default cache directory.
 */
export function clearGeometryCache(cacheDir: string = DEFAULT_CACHE_DIR): void {
  if (existsSync(cacheDir)) {
    const snapshots = readdirSync(cacheDir).filter((entry) => entry.endsWith('.json'))
    snapshots.forEach((file) => {
      try {
        rmSync(join(cacheDir, file))
      } catch (err) {
        console.warn(`[imhotep-core] cache clear: failed to remove ${file}: ${err instanceof Error ? err.message : err}`)
      }
    })
  }
}
// ---------------------------------------------------------------------------
// Full extraction result caching (world + selector mapping + errors)
// ---------------------------------------------------------------------------
/** One extraction error as stored alongside a cached world. */
interface CachedExtractionError {
  code: string
  severity: string
  category: string
  message: string
  source: string
  related: string[]
  fixHints: string[]
  metrics: Record<string, unknown>
  sourceRef: Record<string, unknown>
  clauseLabel?: string
}

/**
 * On-disk JSON shape of a full extraction result.
 *
 * `world` holds the world as an already-serialized JSON string, and
 * `selectorToIds` holds the selector map as a list of `[selector, ids]` pairs.
 */
interface CachedExtractionResult {
  version: number
  world: string
  selectorToIds: [string, number[]][]
  errors: Array<CachedExtractionError>
}
/**
 * Serialize a full extraction result (world + selector mappings + errors).
 *
 * The world is serialized to its own JSON string and embedded as a string
 * field; the selector map is flattened to a list of `[selector, ids]` pairs.
 * The payload is tagged with `version: 1`.
 *
 * @param world - The extracted world.
 * @param selectorToIds - Map from selector to the ids it matched.
 * @param errors - Errors recorded during extraction.
 * @returns The JSON text of the serialized result.
 */
export function serializeExtractionResult(
  world: GeometryWorld,
  selectorToIds: Map<string, number[]>,
  errors: CachedExtractionResult['errors'],
): string {
  const worldJson = serializeGeometryWorld(world)
  const selectorPairs: [string, number[]][] = [...selectorToIds]
  const payload: CachedExtractionResult = {
    version: 1,
    world: worldJson,
    selectorToIds: selectorPairs,
    errors,
  }
  return JSON.stringify(payload)
}
/**
 * Deserialize a full extraction result from JSON.
 *
 * @param json - JSON text, normally produced by `serializeExtractionResult`.
 * @returns The reconstructed world, the selector map rebuilt from its stored
 *   pairs, and the stored errors.
 * @throws Error if the payload's `version` is not 1. Errors from `JSON.parse`
 *   and from world deserialization are not caught here.
 */
export function deserializeExtractionResult(json: string): {
  world: GeometryWorld
  selectorToIds: Map<string, number[]>
  errors: CachedExtractionResult['errors']
} {
  const payload = JSON.parse(json) as CachedExtractionResult
  if (payload.version !== 1) {
    throw new Error(`Unsupported extraction cache version: ${payload.version}`)
  }
  const world = deserializeGeometryWorld(payload.world)
  const selectorToIds = new Map(payload.selectorToIds)
  return { world, selectorToIds, errors: payload.errors }
}
/**
 * Read a cached extraction result from disk.
 *
 * Any failure is reported to the caller as a cache miss (`null`). A missing
 * file is the ordinary miss and is returned silently; every other failure
 * (unreadable file, invalid JSON, unsupported version, malformed payload) is
 * logged with `console.warn` so that it stands out from routine misses.
 *
 * @param cacheDir - Directory holding the cache files.
 * @param cacheKey - Key identifying the entry.
 * @returns The cached result, or `null` when it is absent or cannot be loaded.
 */
export async function readCachedExtractionResult(
  cacheDir: string,
  cacheKey: string,
): Promise<{ world: GeometryWorld; selectorToIds: Map<string, number[]>; errors: CachedExtractionResult['errors'] } | null> {
  const path = cacheFilePath(cacheDir, cacheKey)
  try {
    const json = await readFile(path, 'utf-8')
    return deserializeExtractionResult(json)
  } catch (err) {
    // Node file-system errors carry a string `code`; ENOENT means "no such file".
    const code = (err as { code?: unknown } | null | undefined)?.code
    if (code !== 'ENOENT') {
      console.warn(`[imhotep-core] readCachedExtractionResult failed for cacheKey=${cacheKey}: ${err instanceof Error ? err.message : err}. Cache miss.`)
    }
    return null
  }
}
/**
 * Write an extraction result to the cache on disk.
 *
 * The file write itself is asynchronous; directory creation and eviction use
 * synchronous file-system calls.
 *
 * Eviction runs after the write so the new entry is included in the count.
 * Evicting first trimmed the directory to the limit and then added one more
 * file, leaving it one entry over `DEFAULT_MAX_CACHE_ENTRIES`.
 *
 * @param cacheDir - Directory holding the cache files; created if missing.
 * @param cacheKey - Key identifying the entry.
 * @param world - The extracted world.
 * @param selectorToIds - Map from selector to the ids it matched.
 * @param errors - Errors recorded during extraction.
 */
export async function writeCachedExtractionResult(
  cacheDir: string,
  cacheKey: string,
  world: GeometryWorld,
  selectorToIds: Map<string, number[]>,
  errors: CachedExtractionResult['errors'],
): Promise<void> {
  mkdirSync(cacheDir, { recursive: true })
  const path = cacheFilePath(cacheDir, cacheKey)
  await writeFile(path, serializeExtractionResult(world, selectorToIds, errors))
  // The file just written has the newest modification time, so it is the last
  // candidate for removal.
  evictOldestEntries(cacheDir, DEFAULT_MAX_CACHE_ENTRIES)
}
/**
 * Return the default cache directory path.
 *
 * @returns The value of `DEFAULT_CACHE_DIR`, i.e. the `.imhotep-cache`
 *   directory under the working directory the process had when this module
 *   was loaded.
 */
export function getDefaultCacheDir(): string {
  return DEFAULT_CACHE_DIR
}