// Files: Imhotep/packages/imhotep-extractor/src/batching.ts
// 145 lines, 4.6 KiB, TypeScript

/**
* Plan deduplication and batching.
*
* Multiple environment cases, state snapshots, or timeline modes may produce
* identical extraction needs. This module collapses duplicate plans and groups
* compatible requests into batches so the runtime can issue bulk calls.
*/
import type { Environment, StateSnapshot, TimelineSnapshot } from 'imhotep-core'
import type { RequiredFacts, Diagnostic } from './requirements.js'
import type { SelectorPlan } from './selector.js'
// ---------------------------------------------------------------------------
// Extraction Request Shape
// ---------------------------------------------------------------------------
/**
 * A single unit of work for the extractor runtime.
 *
 * One request corresponds to one env case + one state plan + one timeline plan.
 * The runtime may still merge multiple requests into a single browser session
 * if their env and subject sets are compatible.
 */
export interface ExtractionRequest {
/** Unique id for this request. */
requestId: string
/** Id of the environment case this request was built for. */
envCaseId: string
/** Environment parameters (viewport, colorScheme, etc). Partial: any subset of fields may be present. */
env: Partial<Environment>
/** State snapshots to capture under this environment; each entry has an id, a kind, and an optional name. */
stateSnapshots: Array<{ id: string; kind: string; name?: string }>
/** Timeline sampling plan: a mode plus an optional list of numeric samples. */
timeline: { mode: string; samples?: number[] }
/** Subjects whose selectors must be resolved. */
subjects: SelectorPlan[]
/** Union of all facts needed by active clauses in this context. */
requiredFacts: RequiredFacts
/** Diagnostics accumulated while building this request. */
diagnostics: Diagnostic[]
}
// ---------------------------------------------------------------------------
// Deduplication
// ---------------------------------------------------------------------------
/**
 * Collapse extraction requests that describe the same work.
 *
 * Requests are compared by the string produced by `requestKey`. For each
 * distinct key only the first request (in input order) survives; later
 * requests with the same key are dropped whole, including their `requestId`
 * and `diagnostics`.
 *
 * @param requests Requests to filter. The array is not modified.
 * @returns A new array holding the surviving requests in their original order.
 */
export function deduplicateRequests(requests: ExtractionRequest[]): ExtractionRequest[] {
  const knownKeys = new Set<string>()
  return requests.filter((candidate) => {
    const key = requestKey(candidate)
    if (knownKeys.has(key)) {
      return false
    }
    knownKeys.add(key)
    return true
  })
}
/**
 * Serialize a value to JSON with the keys of every non-array object sorted.
 *
 * `JSON.stringify` emits properties in the order they were assigned, so two
 * structurally equal objects can serialize differently. The replacer rebuilds
 * each object with sorted keys before it is written; because the serializer
 * then walks the rebuilt object, nested objects are normalized too. Array
 * order is preserved.
 */
function canonicalJson(value: unknown): string {
  return JSON.stringify(value, (_key: string, node: unknown) => {
    if (node === null || typeof node !== 'object' || Array.isArray(node)) {
      return node
    }
    const source = node as Record<string, unknown>
    const sorted: Record<string, unknown> = {}
    for (const name of Object.keys(source).sort()) {
      sorted[name] = source[name]
    }
    return sorted
  })
}

/**
 * Build a deterministic string key for an extraction request.
 *
 * The key covers the env, the sorted state snapshot ids, the timeline mode,
 * the sorted subject selectors, and the required facts. It is independent of
 * the order of snapshots and subjects and of the property order inside `env`
 * or any other nested object.
 *
 * `requestId`, `envCaseId`, snapshot `kind`/`name`, and `diagnostics` are
 * deliberately left out of the key.
 *
 * NOTE(review): `timeline.samples` is not part of the key either, so two
 * requests that differ only in their sample list are treated as identical.
 * Confirm that this is intended.
 *
 * @param req Request to fingerprint. It is not modified.
 * @returns The canonical JSON string used as the request's identity.
 */
function requestKey(req: ExtractionRequest): string {
  return canonicalJson({
    env: req.env,
    // map() returns fresh arrays, so sorting here leaves the request untouched.
    stateSnapshotIds: req.stateSnapshots.map((s) => s.id).sort(),
    timelineMode: req.timeline.mode,
    subjectSelectors: req.subjects.map((s) => s.selector).sort(),
    requiredFacts: serializeRequiredFacts(req.requiredFacts),
  })
}

/**
 * Serialize RequiredFacts into a plain, sortable object.
 *
 * `styles` and `topology` are copied with `Array.from` and sorted so their
 * iteration order does not influence the result (presumably they are Sets or
 * arrays of strings; verify against `RequiredFacts`). All other fields are
 * passed through unchanged.
 *
 * @param facts Facts to serialize. It is not modified.
 * @returns A plain object suitable for JSON serialization.
 */
function serializeRequiredFacts(facts: RequiredFacts): Record<string, unknown> {
  return {
    geometry: facts.geometry,
    fragments: facts.fragments,
    styles: Array.from(facts.styles).sort(),
    topology: Array.from(facts.topology).sort(),
    text: facts.text,
    scroll: facts.scroll,
    clipping: facts.clipping,
    paint: facts.paint,
    visibility: facts.visibility,
    transforms: facts.transforms,
  }
}
// ---------------------------------------------------------------------------
// Batching
// ---------------------------------------------------------------------------
/**
 * A batch is a group of extraction requests that can share a single browser
 * session because their environments are compatible.
 *
 * For V1 the batching rule is simple: requests with the exact same env
 * (viewport, colorScheme, pointer, etc) can be batched.
 */
export interface ExtractionBatch {
/** Id for the batch; `batchRequests` assigns `batch_1`, `batch_2`, ... in order of first appearance. */
batchId: string
/** Shared environment for every request in the batch. */
env: Partial<Environment>
/** Requests grouped into this batch, in the order they were supplied. */
requests: ExtractionRequest[]
}
/**
 * Serialize an environment into a grouping key that does not depend on the
 * order in which its properties were assigned.
 *
 * `JSON.stringify` emits keys in property order, so `{ a, b }` and `{ b, a }`
 * would otherwise yield different strings and land in different batches. The
 * replacer rebuilds every non-array object with sorted keys (nested objects
 * included); arrays keep their order.
 */
function envGroupingKey(env: unknown): string {
  return JSON.stringify(env, (_key: string, node: unknown) => {
    if (node === null || typeof node !== 'object' || Array.isArray(node)) {
      return node
    }
    const source = node as Record<string, unknown>
    const sorted: Record<string, unknown> = {}
    for (const name of Object.keys(source).sort()) {
      sorted[name] = source[name]
    }
    return sorted
  })
}

/**
 * Group deduplicated requests into batches by environment compatibility.
 *
 * Requests whose env objects hold the same values end up in the same batch,
 * regardless of the order in which the env properties were assigned. Batches
 * are returned in order of first appearance and numbered `batch_1`,
 * `batch_2`, ...; requests keep their input order within a batch.
 *
 * @param requests Requests to group. Neither the array nor the requests are modified.
 * @returns One batch per distinct environment. Each batch's `env` is a JSON
 *   round-trip copy of the env of the first request in that batch.
 */
export function batchRequests(requests: ExtractionRequest[]): ExtractionBatch[] {
  // Map preserves insertion order, so batch order follows first appearance.
  const batchesByEnv = new Map<string, ExtractionBatch>()
  for (const req of requests) {
    const envKey = envGroupingKey(req.env)
    const existing = batchesByEnv.get(envKey)
    if (existing) {
      existing.requests.push(req)
      continue
    }
    batchesByEnv.set(envKey, {
      batchId: `batch_${batchesByEnv.size + 1}`,
      // JSON round-trip gives the batch its own plain-data copy of the env.
      env: JSON.parse(JSON.stringify(req.env)) as Partial<Environment>,
      requests: [req],
    })
  }
  return Array.from(batchesByEnv.values())
}