diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..03547ed --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +.venv/ +__pycache__/ +*.py[cod] +.pytest_cache/ +*.egg-info/ \ No newline at end of file diff --git a/docs/document_schema_specification.md b/docs/document_schema_specification.md new file mode 100644 index 0000000..cb12078 --- /dev/null +++ b/docs/document_schema_specification.md @@ -0,0 +1,788 @@ +# Document Schema Specification + +**Project:** LLM-Assisted Extraction of Agronomic and Ecological Experiments into Structured Data +**Layer:** Document Understanding Layer (Document Object only) +**Status:** Draft for review — intended to be frozen upon approval +**Predecessor artifact:** Raw Marker Model (`marker_adapter/raw_model.py`), frozen +**Empirical basis:** `Marker Output — Empirical Findings (Paper 1: Nutrient Cycling, Smukler et al. 2012)` + +This document is an engineering specification, not an implementation. It defines every +object in the canonical Document layer — its purpose, fields, types, relationships, +identifier rule, provenance rule, validation rules, and serialization requirements — +so that implementing the Pydantic v2 Document Object later requires translation, not +design. No parsing, normalization, or business logic is described here; only the shape +of the data those future components will populate. + +--- + +## 0. Relationship to the Raw Marker Model + +The Raw Marker Model is a lossless, uninterpreted mirror of Marker's JSON output: a +single `MarkerBlock` envelope type, `extra="allow"`, no discriminated union, no +semantic interpretation. The Document Object described here is the next layer down +the pipeline: it is produced *from* the Raw Marker Model by the Normalizer (not yet +implemented) and is the first place where **structural** interpretation occurs — +deciding what counts as a Table, a Section, a Footnote's likely attachment — while +still containing zero scientific meaning. 
+ +Every Document-layer object therefore exists in addition to, not instead of, the Raw +Marker Model. The Raw Marker Model remains the permanent ground truth on disk; +the Document Object is a derived, queryable, typed structural view over it. This +specification assumes the Raw Marker Model is available as an immutable input and +focuses entirely on what the Normalizer must produce from it. + +--- + +## 1. Architectural Invariants + +These rules are not per-object — they govern the entire Document layer and every +object defined below conforms to them without restating them per-section. + +**1.1 Immutability.** Every Document-layer model is frozen after construction +(Pydantic v2 `model_config = ConfigDict(frozen=True)`). No object is mutated after +the Normalizer finishes building it. Corrections, re-interpretation, or review +happen in later layers (IR, Scientist Review) and never write back into the +Document Object. + +**1.2 Structural-only content.** No Document-layer object may contain a field whose +purpose is to record scientific meaning. Concretely: no `treatment`, `species`, +`observation`, `management_event`, `variable`, or `trait` field exists anywhere in +this schema, even as an optional placeholder. If a future need arises to record such +a concept, it belongs in the IR, which is built on top of — never inside — the +Document Object. + +**1.3 Deterministic identifiers.** No identifier in this schema is a UUID4 or any +other non-deterministic value. Every identifier is a pure function of stable inputs, +so that re-running the same PDF through the same Marker version and the same +Normalizer version always yields byte-identical identifiers. The exact construction +rule is given in Section 2. 
+ +**1.4 Maximum available provenance.** Every object that originates from one or more +Marker blocks retains a `StructuralProvenance` value (Section 3.3) referencing the +originating Marker block id(s), page number, bounding box, polygon, and reading-order +position. An object is never permitted to "lose" its Marker origin even when the +Normalizer reshapes or merges multiple Marker blocks into one Document object (e.g. +turning a `SectionHeader` + `Text` pair into one normalized `Caption`). + +**1.5 Deterministic, lossless serialization.** Every model in this schema must +support `model_dump()` / `model_dump_json()` and round-trip back through +`model_validate()` without information loss, exactly as already verified for the +Raw Marker Model. Field ordering in dumped JSON is determined by declaration order +in the Pydantic model (not insertion order at runtime) to keep serialized output +byte-stable across runs. + +**1.6 Independence from downstream layers.** Nothing in this schema imports from, +references, or anticipates retrieval, LLM extraction, validation, the IR, or BETYdb +export. The Document Object's public surface is consumed by those layers, but this +schema has zero knowledge of them. + +--- + +## 2. Identifier Strategy + +**Rule.** Every Document-layer object's `id` is computed as: + +``` +id = "doc:" + sha256( document_id + "|" + canonical_path )[:16] +``` + +where `document_id` is the parent Document's own id (Section 4), and +`canonical_path` is a deterministic structural path string specific to each object +type, defined per-object below (generally derived from the originating Marker +block's own path-like id, e.g. `/page/7/Table/2`, when one exists 1:1; or, for objects +synthesized from multiple Marker blocks or with no direct Marker counterpart — such +as a parsed `TableRow` — a path built from the parent object's id plus an ordinal +position among deterministically-ordered siblings, e.g. `.../Table/2/row/3`). 
+ +**Why a hash rather than reusing Marker's path id directly.** Marker's own ids +(`/page/7/Table/2`) are positional/index-based — `Table/2` means "third +Table-typed block encountered in that page's traversal." If a future Marker version +changes internal traversal order, encounters a new block type, or reorders block +discovery, these indices could silently shift between runs on an unchanged PDF, +producing different "stable" ids for the same content. Hashing a path that is +itself still derived from Marker's structure, combined with the document id, +preserves determinism for a fixed Marker/adapter version while making the contract +explicit: **stability is guaranteed within one Marker version, not promised across +Marker upgrades.** The original Marker path is never discarded — it is preserved +verbatim inside every object's `StructuralProvenance.marker_block_ids` — so a Marker +version bump that changes traversal order is detectable (ids change) and +diagnosable (provenance still shows the old vs. new Marker ids). + +**document_id construction.** `document_id = "betydoc:" + sha256(source_pdf_identifier)[:16]`, +where `source_pdf_identifier` is a stable external identifier for the source PDF +(DOI if known, else a content hash of the source PDF bytes). This deliberately +excludes Marker version and timestamp from the identity computation: the same PDF +must always resolve to the same `document_id` so that re-processing (e.g. after a +Normalizer bug fix) updates the same logical Document Object rather than minting an +unrelated one. Marker version and processing time are recorded as **metadata about +the materialization**, not folded into identity — see `ProcessingMetadata` (Section +6). + +**Properties guaranteed by this scheme:** +- Same PDF + same Marker version + same Normalizer version ⇒ identical ids + throughout the tree. +- Ids are opaque strings, safe to use as dictionary keys, filenames, or database + foreign keys. 
+- Every id is traceable backward to a concrete Marker block via + `StructuralProvenance`, satisfying invariant 1.4. + +--- + +## 3. Foundational Supporting Types + +These are not top-level entities; they are embedded value objects used throughout +the schema. + +### 3.1 BoundingBox + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `x0` | float | yes | Marker-observed | Left edge | +| `y0` | float | yes | Marker-observed | Top edge | +| `x1` | float | yes | Marker-observed | Right edge | +| `y1` | float | yes | Marker-observed | Bottom edge | + +Directly carried over from Marker's `bbox` (already typed as `MarkerBBox` in the Raw +Marker Model). Retained at the Document layer because footnote-to-table attachment, +evidence highlighting in the Scientist Review UI, and any future geometric +reconstruction (e.g. merged-cell heuristics) all require it. **Invariant:** `x1 >= +x0` and `y1 >= y0`; the Normalizer is responsible for not constructing a violating +instance, but the model also validates this on construction since the cost of +allowing silently-inverted boxes downstream is high (evidence UI would render +boxes wrong with no error signal). + +### 3.2 Polygon + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `points` | list of 4 `(float, float)` pairs | yes | Marker-observed | Carried over from Marker's `polygon` | + +Retained even though `BoundingBox` is derivable from it, because Marker provides +both independently and the polygon can in principle capture skew that an +axis-aligned bbox cannot. This is a direct empirical carry-over (already present and +typed in the Raw Marker Model) rather than a new design — Document-layer objects +simply forward it unchanged. No Document-layer object computes one from the other; +both come from Marker as-is. + +### 3.3 StructuralProvenance + +This is the single most important supporting type in the schema — it is what +satisfies invariant 1.4 for every object below. 
+ +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `marker_block_ids` | list of str | yes (min length 1) | Marker-observed | The originating Marker block id(s), e.g. `["/page/7/Table/2"]`. A list, not a single value, because some Document objects (e.g. a normalized `Caption` under Pattern B) are synthesized from more than one Marker block. | +| `page_number` | int | yes | Marker-observed | The PDF page this object originates from. For objects spanning conceptually across the synthesis of multiple Marker blocks, this is the page of the primary/first contributing block. | +| `bbox` | `BoundingBox` | no | Marker-observed | Present for any object with a single, well-defined originating region. Absent for objects synthesized from multiple non-adjacent blocks where a single bbox would be misleading (e.g. a Pattern-B caption combining a `SectionHeader` far above a trailing `Text` note) — in that case `contributing_bboxes` is populated instead. | +| `contributing_bboxes` | list of `BoundingBox` | no | Marker-observed | Used instead of (or in addition to) `bbox` when more than one Marker block contributes geometry, preserving each one rather than collapsing them into a single misleading box. | +| `polygon` | `Polygon` | no | Marker-observed | Mirrors `bbox`'s optionality logic. | +| `reading_order_index` | int | yes | Architectural requirement | The object's position in the document's global linear reading order (Section 3.4). Required on every provenance instance because every structural object has a place in reading order even if its bbox is ambiguous. | +| `section_path` | list of str | yes (may be empty) | Marker-observed (derived) | The chain of governing `SectionHeader` Marker-block ids from Marker's own `section_hierarchy` map, ordered outermost to innermost. Empty only for objects outside any section (e.g. a journal wrapper page's `Picture`). 
| + +**Why a list of Marker block ids rather than exactly one.** Empirically, not every +Document-layer concept maps 1:1 to a Marker block. The clearest case is `Table` +captions: under Pattern A (`TableGroup`), the caption is one `Caption` block; under +Pattern B (bare `Table`), the equivalent information is split across a +`SectionHeader` block and a `Text` block, sometimes with a trailing "Note:" `Text` +block. Forcing a single-id provenance field would require silently picking one +contributing block and losing the others. A list preserves all of them, satisfying +invariant 1.4 even when normalization merges several Marker blocks into one +Document concept. + +### 3.4 Reading Order + +Reading order is **not** a field on a supporting type — it is a global integer +sequence assigned by the Normalizer to every leaf and container object during +construction, equal to that object's position in a single depth-first traversal of +the final Document Object tree, in `children` array order. + +This decision is made explicitly here because it was a confirmed empirical finding, +not a default assumption: Marker's own block id local-index numbers (e.g. the +trailing `4` in `/page/7/Footnote/4`) are **not** monotonic with true reading order — +`Footnote/4` and `Footnote/5` physically appear, in the actual children array, after +`Table/8`, despite having lower index numbers. Reading order must therefore be +(re)computed by the Normalizer from final tree position, never inferred from Marker's +id numbering. `StructuralProvenance.reading_order_index` is this recomputed value, +not a copy of any number embedded in a Marker id string. + +### 3.5 Section Path + +`StructuralProvenance.section_path` is populated directly from Marker's own +`section_hierarchy` dict, which the empirical findings confirmed is already a +precomputed breadcrumb (e.g. a deeply nested `TableCell` carrying +`{'1': '/page/1/SectionHeader/1', '4': '/page/7/SectionHeader/0'}`). 
Two properties +of this dict are carried forward into the spec rather than assumed away: + +- **Depth keys are not contiguous small integers.** The observed keys were `'1'` + and `'4'`, not `'1'` and `'2'`, indicating these correspond to some absolute + nesting depth from Marker's internal traversal rather than a clean rank. The + Document schema therefore stores `section_path` as an **ordered list of + SectionHeader Marker-block ids** (sorted by the numeric value of their original + dict key) rather than preserving Marker's dict-with-gaps shape — this gives + downstream consumers (Retrieval layer, Section nesting) a clean, ordinary list + without forcing them to understand Marker's internal depth-key semantics. +- **The mapping is per-block, not per-Section-object.** Every Marker block — + including deeply nested ones like a `TableCell` — carries its own full path. The + Document Object's `Section` containment hierarchy (Section 9) is derived from this + same data, so `section_path` on any object and that object's ancestor `Section` + chain are guaranteed consistent by construction, not by a separate invariant check. + +--- + +## 4. Document + +**Purpose.** The root container for one processed paper. Holds the page sequence, +top-level metadata, processing metadata, and aggregate statistics. Exactly one +`Document` exists per source PDF per Normalizer run. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `id` | str | yes | Architectural requirement | `document_id`, Section 2. | +| `source_pdf_identifier` | str | yes | Architectural requirement | The stable external identifier used to compute `id` (DOI or content hash). Stored explicitly so the id's derivation is independently checkable, not just trusted. | +| `metadata` | `Metadata` | yes | Marker-observed + Architectural | Section 5. | +| `processing_metadata` | `ProcessingMetadata` | yes | Architectural requirement | Section 6. 
| +| `statistics` | `Statistics` | yes | Architectural requirement | Section 7. | +| `pages` | list of `Page` | yes (min length 1) | Marker-observed | Ordered by page number ascending; this ordering is also the top level of global reading order. | + +**Invariants.** +- `pages` is non-empty and sorted ascending by `Page.page_number` with no + duplicate or skipped page numbers other than what Marker itself reported (a + Marker-side page omission is preserved, not silently re-numbered). +- `Document` is the only object in this schema with no `StructuralProvenance` of + its own (there is no single Marker block representing "the whole document" — the + Raw Marker Model's root node was empirically confirmed to have no `id`, `bbox`, + or `polygon` at all). Its provenance is implicitly "the entire Raw Marker Model + file," which `processing_metadata.source_marker_artifact_ref` captures (Section + 6) rather than a `StructuralProvenance` instance. + +--- + +## 5. Metadata + +**Purpose.** Bibliographic and identification facts about the paper, to the extent +they are structurally recoverable (not semantically extracted — see the boundary +note below). + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `title` | Optional[str] | no | Marker-observed | Taken verbatim from the first/top-level `SectionHeader` or title-styled block on the front matter page, if structurally identifiable. | +| `page_count` | int | yes | Marker-observed | Count of `Page` objects; redundant with `len(pages)` but kept as an explicit field since `Statistics` (Section 7) is meant to hold *derived counts*, while this one is a basic identifying fact worth surfacing without traversing the tree. | +| `has_front_matter_page` | bool | yes | Marker-observed (heuristic) | Whether page 0 (or any page) was structurally flagged as publisher wrapper content. See `Page.is_front_matter` (Section 8) for the per-page flag this aggregates. 
| + +**Boundary note.** `Metadata` deliberately does **not** include authors, journal +name, publication year, or DOI as structured fields, even though these are +intuitively "metadata." Per the empirical findings (3.8), front-matter and +citation-bearing content is **structurally indistinguishable** from other text at +the block-type level — recovering "the authors" or "the journal" requires reading +and interpreting text content, which is scientific/semantic extraction, not +structural parsing. That work belongs to the IR's `Citation` entity (already +specified in the project's broader IR design), built by the extraction layer. This +spec only exposes what is mechanically true of the page structure (title block +location, page count, front-matter flag) — adding speculative `author`/`doi`/`year` +fields here would violate invariant 1.2 and the "no speculative fields" instruction, +since populating them correctly is not a structural operation. + +--- + +## 6. ProcessingMetadata + +**Purpose.** Records *how* this particular Document Object was produced, separate +from *what* it identifies (Section 4's `id`/`source_pdf_identifier`). This is what +makes reproducibility checkable: two Document Objects with the same `id` but +different `ProcessingMetadata` indicate the same paper was processed by a different +Marker or Normalizer version, which is exactly the signal needed to detect drift +without conflating it with document identity. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `marker_version` | str | yes | Marker-observed | Verbatim from Marker's own output metadata, if present; otherwise the version string of the Marker invocation recorded by the adapter. | +| `normalizer_version` | str | yes | Architectural requirement | Semantic version of the Normalizer code that produced this Document Object. Required so a future schema/logic change is always attributable. 
| +| `processed_at` | datetime (ISO 8601, UTC) | yes | Architectural requirement | Wall-clock time of this materialization. Explicitly **not** part of `id` computation (Section 2) — recorded for audit/debugging only. | +| `source_marker_artifact_ref` | str | yes | Architectural requirement | A path or content hash identifying the exact Raw Marker Model JSON file this Document Object was normalized from, satisfying the "Document has no own provenance" note in Section 4 by pointing at the file-level artifact instead of a block-level one. | + +**Why this is architectural rather than Marker-observed for most fields.** Only +`marker_version` comes from Marker itself; the rest exist purely because the +project's stated reproducibility requirement ("identical PDFs ... should always +produce identical Document Objects," and detectability of drift) demands a place to +record the inputs that determine reproducibility, even though Marker's own output +has no opinion on them. + +--- + +## 7. Statistics + +**Purpose.** Aggregate counts over the final Document Object tree, useful for +sanity-checking a Normalizer run (e.g. "did this paper produce zero tables when the +PDF clearly has six") without re-traversing the tree ad hoc. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `page_count` | int | yes | Architectural requirement (derived) | `len(pages)`. | +| `section_count` | int | yes | Architectural requirement (derived) | Total `Section` objects across the document. | +| `paragraph_count` | int | yes | Architectural requirement (derived) | Total `Paragraph` objects. | +| `table_count` | int | yes | Architectural requirement (derived) | Total `Table` objects. | +| `figure_count` | int | yes | Architectural requirement (derived) | Total `Figure` objects. | +| `equation_count` | int | yes | Architectural requirement (derived) | Total `Equation` objects. | +| `footnote_count` | int | yes | Architectural requirement (derived) | Total `Footnote` objects. 
| +| `reference_count` | int | yes | Architectural requirement (derived) | Total `Reference` objects. | +| `unresolved_footnote_count` | int | yes | Architectural requirement (derived) | Footnotes whose `attached_object_id` (Section 16) is `None` after Normalizer processing — a direct, queryable signal of how much of the geometric-attachment heuristic (empirical finding 3.2) succeeded on this paper. | + +**Why this object exists at all, given everything in it is derivable.** Every +field here is computable by traversal, so in principle `Statistics` adds no new +information. It exists as an explicit object — rather than leaving consumers to +compute it themselves — because (a) it gives a single, serializable snapshot for +logging/comparison across Normalizer runs without re-parsing the whole tree, and (b) +`unresolved_footnote_count` specifically operationalizes a concern raised directly +in the empirical findings (footnote attachment is a heuristic, not guaranteed) into +a number that can be tracked across the representative paper set as the Normalizer +is built and tuned. + +--- + +## 8. Page + +**Purpose.** One PDF page's structural content, in reading order. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `id` | str | yes | Architectural requirement | Per Section 2; `canonical_path = "/page/{page_number}"`. | +| `page_number` | int | yes | Marker-observed | Zero-indexed, matching Marker's own page numbering. | +| `provenance` | `StructuralProvenance` | yes | Marker-observed | `marker_block_ids = [the Marker Page block's id]`. | +| `children` | list of (`Section` \| `Paragraph` \| `Table` \| `Figure` \| `Equation` \| `Footnote` \| `PageHeader` \| `PageFooter`) | yes (may be empty) | Marker-observed | Top-level content of the page, in final reading order (Section 3.4). A discriminated union over `block_type`-equivalent kinds, mirroring (but not reusing) Marker's own children-array structure. 
| +| `is_front_matter` | bool | yes | Marker-observed (heuristic) | True if this page was identified as publisher wrapper content (journal cover, "Submit your article," ISSN-only content, etc.) rather than paper body. | + +**On `is_front_matter`.** Empirical finding 3.8 established that Marker gives no +structural signal distinguishing a wrapper page from a content page — both use +identical block types. This flag is therefore explicitly a **heuristic output of +the Normalizer** (content-pattern based, e.g. presence of "ISSN," "Submit your +article," near-total absence of citation-bearing text), not something copied from +Marker. The field is included now, with its value to be computed later, because the +project's stated requirement is that the schema accommodate this known case without +redesign — per the same logic as the other deferred-population fields in this spec +(Section 22 collects all of them explicitly). + +**Why a discriminated union for `children` rather than `list[Any]` or one generic +`Block` type.** The Raw Marker Model deliberately uses one uniform envelope because +it must stay agnostic to block semantics. The Document Object's job is the opposite: +it exists specifically to make structural type distinctions (a `Table` is not +interchangeable with a `Paragraph` downstream). A discriminated union gives +consumers static type safety and keeps `Page`/`Section` children lists +self-describing in serialized JSON via the discriminator field, with no loss of +ordering since list order is itself the reading-order signal (Section 3.4). + +--- + +## 9. Section + +**Purpose.** A heading-governed grouping of content, derived from Marker's +`section_hierarchy` breadcrumbs (Section 3.5) rather than re-derived from text +pattern-matching on headings. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `id` | str | yes | Architectural requirement | `canonical_path` built from the governing `SectionHeader` Marker block's own path id. 
| +| `heading_text` | str | yes | Marker-observed | Verbatim text of the governing `SectionHeader` block. | +| `provenance` | `StructuralProvenance` | yes | Marker-observed | `marker_block_ids = [the SectionHeader block's id]`. | +| `depth` | int | yes | Marker-observed (derived) | Position of this section's heading in the ordered `section_path` list (Section 3.5), zero-indexed from the outermost heading on the page/document. | +| `children` | list of (`Section` \| `Paragraph` \| `Table` \| `Figure` \| `Equation` \| `Footnote`) | yes (may be empty) | Marker-observed | Nested sub-sections and content governed by this heading, in reading order. A `Section` may contain further `Section` objects, giving the hierarchy genuine nesting rather than a flat list with a depth integer alone. | + +**Invariant.** Every leaf or container object elsewhere in the schema that carries +a non-empty `section_path` in its `StructuralProvenance` must have a corresponding +ancestor chain of `Section` objects matching that path exactly — this is guaranteed +by construction (both are derived from the same `section_hierarchy` source, per +Section 3.5) rather than checked as a runtime validator, but it is stated here as a +hard design invariant the Normalizer must not violate. + +**Why `SectionHeader` blocks that are really table/figure labels (e.g. a Marker +`SectionHeader` containing only `"Table 3"`, per Pattern B) do not become `Section` +objects.** Empirical finding 3.1 showed Marker uses the same `SectionHeader` +block type both for genuine paper sections (Methods, Results) and for bare-table +caption labels. The Normalizer must distinguish these by context — a +`SectionHeader` immediately followed by a `Text` block and then a `Table`, with no +intervening structural content, is a caption label being consumed into that +`Table`'s `Caption` (Section 11), not a new `Section`. 
This rule is recorded here so +the schema's `Section` object is understood to represent only genuine paper +sections; the disambiguation logic itself is Normalizer business logic (out of +scope for this document) but the *consequence* — that some `SectionHeader` Marker +blocks become part of a `Caption` rather than a `Section` — is a structural decision +the schema must support, and it does: `Caption.provenance.marker_block_ids` can +include a `SectionHeader` id (Section 11). + +--- + +## 10. Paragraph + +**Purpose.** A single block of body text — the most common leaf content type. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `id` | str | yes | Architectural requirement | Per Section 2. | +| `text` | str | yes | Marker-observed | The block's inline HTML content from Marker, **as-is** (e.g. `<i>`, `<b>` tags preserved). | +| `provenance` | `StructuralProvenance` | yes | Marker-observed | `marker_block_ids = [the originating Text/ListItem block's id]`. | + +**Why `text` keeps inline HTML rather than being plain-text-stripped.** Empirical +finding (leaf block dump) confirmed Marker leaf blocks carry real semantic inline +markup (`<i>`, `<b>`) directly in their content, not a side annotation. Stripping +it at the Document layer would be a one-way, lossy transformation performed before +any consumer has had a chance to decide whether that markup matters (e.g. a `<b>` +emphasis inside a Methods paragraph could matter to the extraction layer's reasoning +about emphasis on a key term). Per invariant 1.4 (maximum available provenance) and +the general "never lose information without a consumer-side decision to do so" +principle, the Document layer preserves it verbatim; any stripping is a retrieval- +or extraction-layer concern, explicitly out of scope here. 
+ +**Note on `ListItem`/`ListGroup`.** Marker's bibliography uses `ListGroup` containers +of `ListItem` leaves rather than `Text` blocks (this was the basis for separating +references structurally without text pattern-matching). A `ListItem` that is part of +a reference list is **not** modeled as a `Paragraph` — it becomes a `Reference` +(Section 17). A `Paragraph` is reserved for body-text `Text`/generic `ListItem` +content; the Normalizer disambiguates by parent context (a `ListGroup` under the +References section vs. elsewhere), again business logic out of scope here, but the +schema accommodates the distinct outcome via two separate object types. + +--- + +## 11. Caption (supporting type, embedded in Table and Figure) + +**Purpose.** A normalized representation of a table or figure's caption, +collapsing Marker's two empirically observed patterns into one shape. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `label` | Optional[str] | no | Marker-observed | E.g. `"Table 3"` or `"Figure 1"`. Present whenever a `Caption` block (Pattern A) or a `SectionHeader` label block (Pattern B) was found. | +| `text` | Optional[str] | no | Marker-observed | The descriptive caption sentence. From the `Caption` block's content (Pattern A) or the `Text` block immediately following the label (Pattern B). | +| `trailing_notes` | Optional[str] | no | Marker-observed | The trailing "Note: ..." `Text` block sometimes observed immediately after a `Table`, distinct from both `label`/`text` and from `Footnote` objects (Section 16). Kept as its own field because it was empirically observed to be part of the caption apparatus, not body text, but also not a true Marker `Footnote` block. | +| `provenance` | `StructuralProvenance` | yes | Marker-observed | `marker_block_ids` lists every contributing Marker block (one for Pattern A's single `Caption` block; two or three for Pattern B's `SectionHeader` + `Text` + optional trailing `Text`). 
Uses `contributing_bboxes` (Section 3.3) rather than a single `bbox` whenever more than one block contributes, since collapsing non-adjacent regions into one bbox would misrepresent the geometry. | + +**Why one normalized shape rather than preserving Marker's two patterns +separately in the schema.** This is the central case the project's "empirically +driven, not speculative" instruction is built around: both patterns were directly +observed (Pattern A on pages 6/10/12, Pattern B on page 7, per finding 3.1), so +normalizing them is not a hypothetical convenience — it is required because every +downstream consumer (extraction layer asking "what is this table about," review UI +displaying "the caption") needs one consistent shape regardless of which pattern the +source PDF happened to produce. Modeling them as two different optional sub-objects +instead would push that disambiguation work onto every consumer rather than once, +inside the Normalizer, where the empirical knowledge of the two patterns actually +lives. + +--- + +## 12. Table + +**Purpose.** A table's logical structure and its evidence-level cell geometry, +kept as two deliberately parallel representations per the empirical recommendation. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `id` | str | yes | Architectural requirement | Per Section 2. | +| `provenance` | `StructuralProvenance` | yes | Marker-observed | `marker_block_ids = [the Table block's id]` (and the `TableGroup` id too, if Pattern A). | +| `caption` | Optional[`Caption`] | no | Marker-observed | Section 11. `None` only if no caption-bearing blocks were found adjacent to the table at all (not empirically observed in the representative paper, but not excluded as a possibility — captionless tables are not assumed impossible). | +| `raw_html` | str | yes | Marker-observed | The Table block's own `html` field, verbatim — the complete, correctly-nested
`<table>...</table>` Marker produces. Treated as the **source of truth for logical structure** (rows, columns, header rows), per the empirical recommendation, precisely because reconstructing structure independently from cell geometry risks disagreeing with Marker's own (already correct) parse. | +| `rows` | list of `TableRow` | yes (may be empty) | Marker-observed (derived) | A structured parse of `raw_html`'s `<tr>` elements into row objects (Section 12.1), giving consumers row/column access without re-parsing HTML themselves. Derived from `raw_html`, not an independent reconstruction. | +| `cells` | list of `TableCell` | yes (may be empty) | Marker-observed | The flat list of Marker `TableCell` child blocks, retained **only** as evidence/geometry data (bbox, polygon, provenance) — explicitly not used to derive row/column structure, per the empirical recommendation that `raw_html` is structural truth and `TableCell` geometry is supplementary. | +| `footnote_ids` | list of str | yes (may be empty) | Architectural requirement (deferred population) | Ids of `Footnote` objects geometrically attached to this table (Section 16). Empty until the Normalizer's bbox-proximity heuristic (finding 3.2) runs; the field exists now so that heuristic's output has a defined home without later schema change. | + +### 12.1 TableRow (supporting type, embedded in `Table.rows`) + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `cells` | list of `TableRowCell` | yes | Marker-observed (derived) | Ordered left to right per the source `<tr>`. 
| + +### 12.2 TableRowCell (supporting type, embedded in `TableRow.cells`) + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `text` | str | yes | Marker-observed | Cell text content from the parsed `<td>`/`<th>` element, with any `<math>` wrapper tag stripped and its content treated as equivalent plain text (per empirical finding 3.5 — Marker inconsistently wraps numerically identical `mean ± stderr` values in `<math>` depending on OCR path; the schema does not preserve this distinction since it carries no structural meaning, only an OCR-routing artifact). | +| `is_header` | bool | yes | Marker-observed | True if the source element was `<th>`, false for `<td>`. | +| `structural_notes` | Optional[str] | no | Architectural requirement (deferred) | A free-text slot reserved for a Normalizer-attached structural annotation — most notably, a suspected merged-cell placeholder (empirical finding 3.4: Marker silently flattens merged header cells into duplicated rows with an empty filler cell, with no flag distinguishing this from a genuinely empty cell). The heuristic for populating this field is explicitly **not** decided in this specification — finding 3.4 was flagged as needing more representative papers before a reconstruction rule is chosen. The field is included as an open slot precisely so that decision can be made later without a schema change, consistent with the brief's instruction to accommodate known structural cases without redesign. | + +**Why `TableRowCell` does not have `row_index`/`col_index` integers.** These are +implicit in `Table.rows`' list-of-lists structure itself (a cell's row is its +containing `TableRow`'s position in `rows`; its column is its own position in +`cells`), so adding redundant integer fields would duplicate information already +present in list order, with no Marker-observed justification for storing it twice. 
+ +**Why `TableRowCell` has no `rowspan`/`colspan` field.** Empirical finding 3.4 +confirmed Marker's HTML output never emits `rowspan`/`colspan` attributes even where +the source PDF visually has merged cells — it flattens instead. Adding a +`rowspan`/`colspan` field for a case never observed in Marker's actual output would +violate the "no speculative fields" instruction. If a future representative paper +demonstrates Marker does emit span attributes under some condition, this is the +single place such fields would be added. + +### 12.3 TableCell (supporting type, embedded in `Table.cells`; evidence-only) + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `id` | str | yes | Architectural requirement | Per Section 2; path derived from the Marker `TableCell` block's own id. | +| `text` | str | yes | Marker-observed | Verbatim cell content (not math-stripped here — this object is evidence/geometry, not the logical text consumers should read; `TableRowCell.text` is the cleaned version). | +| `bbox` | `BoundingBox` | yes | Marker-observed | Per-cell geometry, the entire reason this parallel representation is retained (evidence highlighting in the review UI). | +| `polygon` | `Polygon` | yes | Marker-observed | Mirrors `bbox`. | + +**Why this evidence-only `TableCell` and the logical `TableRowCell` are not unified +into one type.** Empirical finding 3.3 established these are genuinely two +different, only partially-corresponding representations Marker provides in +parallel — one (the `<table>` HTML) has correct logical structure but no per-cell +geometry, the other (`TableCell` children) has per-cell geometry but no row/column +index. Forcing them into a single type would require either fabricating row/column +indices on the geometry side (an unverified bbox-clustering reconstruction the +findings explicitly flagged as risky) or discarding per-cell geometry on the logical +side (losing the evidence-highlighting capability entirely). 
Keeping them separate, +each true to what Marker actually provides, is the choice that adds no +unverified inference. **Open implementation note, not a schema decision:** +positionally correlating a given `TableRowCell` with its corresponding `TableCell` +(for evidence highlighting of a specific logical cell) is left to the Normalizer to +attempt via parse-order correspondence; this spec does not assert that +correspondence is guaranteed, since it was not empirically verified. + +--- + +## 13. Figure + +**Purpose.** A figure region, with its caption normalized the same way as `Table`. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `id` | str | yes | Architectural requirement | Per Section 2. | +| `provenance` | `StructuralProvenance` | yes | Marker-observed | `marker_block_ids = [the Figure block's id]` (and `FigureGroup` id, if present). | +| `caption` | Optional[`Caption`] | no | Marker-observed | Section 11. Empirically, `FigureGroup` always pairs `[Figure, Caption]` in that order (mirroring `TableGroup`'s pairing, just with reversed order — confirmed, not assumed, per the findings doc). | +| `image_data` | Optional[bytes] | no | Marker-observed | Base64-decoded raster image content from Marker's `images` field, when present. Empirically, in the representative paper, only `Picture` blocks (journal logo, cover thumbnail) carried non-empty `images`; the one `Figure` block had `images: {}`. This field is therefore included (figures plausibly can carry raster data, and the project must not assume they never will) but its emptiness in the current evidence base is recorded explicitly in Section 22 as unconfirmed, not silently assumed resolved. | + +--- + +## 14. Equation + +**Purpose.** A mathematical expression block. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `id` | str | yes | Architectural requirement | Per Section 2. 
| +| `provenance` | `StructuralProvenance` | yes | Marker-observed | `marker_block_ids = [the Equation block's id]`. | +| `raw_math` | str | yes | Marker-observed | The verbatim MathML-ish `<math>` content, including any equation number embedded inline (e.g. `"DP = I - IR + P - ETc \pm VR, \qquad (1)"`), per empirical finding 3.7. | +| `equation_number` | Optional[str] | no | Architectural requirement (deferred) | A slot for the parsed-out equation number (e.g. `"1"`), since finding 3.7 confirmed Marker provides no separate field for it — any cross-reference resolution ("using equation (1)" in body text) requires parsing it out of `raw_math`. The parsing logic itself is out of scope for this spec; the field exists so its result has a defined home. | + +--- + +## 15. Footnote + +**Purpose.** A footnote block, with its attachment to a table or figure resolved +geometrically rather than structurally, per empirical finding 3.2. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `id` | str | yes | Architectural requirement | Per Section 2. | +| `provenance` | `StructuralProvenance` | yes | Marker-observed | `marker_block_ids = [the Footnote block's id]`. Note: this block's own provenance never implies attachment — footnotes are flat page-level siblings, not children of any Table/Figure (finding 3.2), so attachment is recorded separately below. | +| `raw_text` | str | yes | Marker-observed | Verbatim footnote content. | +| `attached_object_id` | Optional[str] | no | Architectural requirement (deferred) | The id of the `Table` or `Figure` this footnote was determined to belong to, via the Normalizer's bbox-proximity heuristic ("nearest preceding Table/Figure on the same page by bbox y-position," per finding 3.2). `None` when unresolved — tracked in aggregate by `Statistics.unresolved_footnote_count` (Section 7). 
The heuristic itself is Normalizer logic, out of scope here; the field exists so its output, including the legitimate possibility of non-resolution, has a defined, queryable home. | + +**Why attachment is nullable rather than required.** Forcing every footnote to +resolve to a table/figure would hide genuine ambiguity (e.g. a footnote whose +geometric position is equidistant between two candidates, or a page-level +disclaimer footnote unrelated to any table) behind an incorrect best-guess. Per the +project's broader principle (already established for the IR: "fields that cannot be +resolved are marked with an unresolved status rather than silently filled"), the +same discipline applies at the structural layer: `None` is a legitimate, recorded +outcome, not an implementation gap to paper over. + +--- + +## 16. Reference (Bibliography Entry) + +**Purpose.** One bibliography entry, structurally distinguished from body text +because Marker represents references via `ListGroup`/`ListItem`, not `Text` blocks +(observed directly in the page_stats/structure walkthrough, not inferred from +content). + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `id` | str | yes | Architectural requirement | Per Section 2. | +| `provenance` | `StructuralProvenance` | yes | Marker-observed | `marker_block_ids = [the ListItem block's id]`. | +| `raw_text` | str | yes | Marker-observed | Verbatim reference entry text, including any inline markup Marker preserved. | + +**Boundary note.** Like `Metadata` (Section 5), `Reference` deliberately stops at +verbatim text. Parsing a reference string into author/year/journal/DOI fields is +citation-matching — a semantic operation belonging to the IR's `Citation` entity, +not this layer. This object's only job is to say "this `ListItem`, structurally, +is a bibliography entry, not body text," which is information Marker's block typing +already gives for free via the `ListGroup` container. + +--- + +## 17. 
PageHeader / PageFooter + +**Purpose.** Repeated journal running-header/footer content (e.g. the journal name +repeated on every page, or page-footer branding), retained for completeness and +front-matter heuristics (Section 8) but not expected to be consumed by extraction. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `id` | str | yes | Architectural requirement | Per Section 2. | +| `provenance` | `StructuralProvenance` | yes | Marker-observed | `marker_block_ids = [the PageHeader/PageFooter block's id]`. | +| `raw_text` | str | yes | Marker-observed | Verbatim content. | + +Modeled as two distinct types (`PageHeader`, `PageFooter`) rather than one generic +"running content" type, simply mirroring Marker's own distinct block types +one-to-one — there is no structural reason to merge them, and merging would lose +the type distinction Marker itself already makes. + +--- + +## 18. Cross-Object Relationships & Invariants Summary + +This section consolidates relationship rules stated piecemeal above, for a single +point of reference. + +- **Containment is exclusively via `children` lists and explicit id-reference + fields (`footnote_ids`, `attached_object_id`) — never via implicit ordering + conventions or id-string parsing.** Any consumer needing "what footnotes belong + to this table" reads `Table.footnote_ids`, never re-derives it from geometry + itself; the Normalizer computes that relationship exactly once. +- **Reading order is a single global property (Section 3.4), independent of any + per-object containment.** Two sibling objects under different `Section`s can be + compared for relative reading order via their `reading_order_index` without + needing to know anything about section nesting. +- **Every non-`Document` object carries exactly one `StructuralProvenance`,** which + is the only place Marker block ids appear outside of `ProcessingMetadata`'s + artifact reference. No object duplicates Marker ids elsewhere in its own fields. 
+- **No object type defined in this specification has a field referencing an IR, + retrieval, validation, or export concept**, satisfying invariant 1.6 by + construction — this is checked by inspection of this document, not by a runtime + rule, since it is a closed schema with a fixed object list. + +--- + +## 19. Serialization Requirements + +- All models use `model_config = ConfigDict(frozen=True, extra="forbid")`. Unlike + the Raw Marker Model (which intentionally used `extra="allow"` for forward + compatibility with unknown future Marker fields), the Document Object is the project's own + designed contract — an unexpected extra field here indicates a Normalizer bug, + not a benign future Marker addition, so it should fail loudly (`extra="forbid"`) + rather than silently passing through. +- `model_dump_json()` must be deterministic for a given object graph: field order + follows declaration order (Pydantic v2 default), list order follows the + semantically meaningful order already specified per field (reading order for + children, left-to-right for table cells, outermost-to-innermost for + `section_path`) — never a non-deterministic order like dict-hash order. + `datetime` fields serialize as ISO 8601 strings in UTC. +- Every model must round-trip losslessly through `model_dump()` → + `model_validate()` and `model_dump_json()` → `model_validate_json()`, mirroring + the test discipline already established and passing for the Raw Marker Model. +- Bytes fields (`Figure.image_data`) serialize as base64 strings in JSON, matching + Marker's own convention for `images`, so no separate encoding scheme is + introduced at this layer. + +--- + +## 20. Validation Rules + +These are construction-time invariants enforced by each model's own validators, +distinct from the cross-cutting invariants in Section 1 (which are policies the +Normalizer must follow, not all individually mechanically checkable). + +- `BoundingBox`: `x1 >= x0` and `y1 >= y0`. 
+- `StructuralProvenance`: `marker_block_ids` has at least one element; + `reading_order_index >= 0`; exactly one of `bbox` or `contributing_bboxes` (or + neither, for objects with genuinely no recoverable geometry) is populated — never + both, to avoid two disagreeing geometric claims about the same object. +- `Document`: `pages` non-empty; `page_number` values across `pages` are unique. +- `Page`: `page_number >= 0`. +- `Table`: if `rows` is non-empty, every `TableRow.cells` list has at least one + element (a row with zero cells is not a meaningful row — such input indicates a + parse error in `raw_html`, which should surface as a Normalizer-time error, not a + silently-accepted empty row in the Document Object). +- `Footnote`: no validation forces `attached_object_id` to be set — its absence is + valid by design (Section 15). +- `Statistics`: every count field is `>= 0`; `unresolved_footnote_count <= + footnote_count` (a basic sanity bound the model itself can check independent of + whatever produced the numbers). + +--- + +## 21. Explicitly Deferred — Not Modeled, By Design + +Per the instruction to avoid speculative fields, the following structural cases +identified during evaluation are **intentionally absent** from this schema rather +than represented with a guessed-at field shape, because the representative paper +set does not yet provide enough evidence to know what shape is correct: + +- **Multi-page table continuation.** Not observed in the representative paper (no + table spans a page break). No `continues_on_page` / `continuation_of_table_id` + field is added speculatively. When a representative paper exhibiting this is + evaluated, this section is where such a field would be added — as an addition, + not a redesign, since `Table` already has a stable `id` to reference. 
+- **Multi-panel figure decomposition.** The representative paper's Figure 3 has 10 + visually lettered sub-panels under one shared caption, but Marker recorded it as + a single flat `Figure` block with no internal panel structure. Since this is the + only data point (n=1) and it shows Marker *not* decomposing panels, no `panels: + list[FigurePanel]` field is added on the strength of a PDF-visual observation that + contradicts what Marker itself outputs. If a future paper shows Marker does + sometimes decompose panels, this is where that field would be introduced. +- **TableGroup-vs-bare-Table triggering condition.** Both patterns are modeled + (via `Caption`'s flexible provenance, Section 11), but *why* Marker chooses one + over the other (single table per region vs. dense multi-table page, per the one + data point available) is not encoded as a schema concept — it doesn't need to be, + since the Document Object normalizes both outcomes into the same `Caption` shape + regardless of cause. +- **Merged-cell reconstruction heuristic.** The *slot* (`TableRowCell.structural_ + notes`) exists (Section 12.2), but the specific rule for populating it (e.g. + "empty cell directly below a filled cell in the same column ⇒ merged-placeholder + suspected") is explicitly not decided here, per finding 3.4's own conclusion that + this needs more representative papers first. + +--- + +## 22. Summary — Field Origin Distribution + +A consolidated view of the distinction requested for this specification: how many +fields per object are direct Marker carry-overs versus existing purely to satisfy +an architectural requirement (provenance, determinism, reproducibility, +serialization) versus reserved as a deferred-population slot for a Normalizer +heuristic not yet designed. 
+ +| Object | Marker-observed fields | Architectural-requirement fields | Deferred-population slots | +|---|---|---|---| +| Document | `pages` | `id`, `source_pdf_identifier`, `metadata`, `processing_metadata`, `statistics` | — | +| Metadata | `title`, `page_count`, `has_front_matter_page` | — | — | +| ProcessingMetadata | `marker_version` | `normalizer_version`, `processed_at`, `source_marker_artifact_ref` | — | +| Statistics | — | all count fields | — | +| Page | `page_number`, `children`, `is_front_matter` (heuristic) | `id`, `provenance` | — | +| Section | `heading_text`, `depth`, `children` | `id`, `provenance` | — | +| Paragraph | `text` | `id`, `provenance` | — | +| Caption | `label`, `text`, `trailing_notes` | `provenance` | — | +| Table | `raw_html`, `rows`, `cells`, `caption` | `id`, `provenance` | `footnote_ids` | +| TableRowCell | `text`, `is_header` | — | `structural_notes` | +| TableCell | `text`, `bbox`, `polygon` | `id` | — | +| Figure | `caption`, `image_data` | `id`, `provenance` | — | +| Equation | `raw_math` | `id`, `provenance` | `equation_number` | +| Footnote | `raw_text` | `id`, `provenance` | `attached_object_id` | +| Reference | `raw_text` | `id`, `provenance` | — | +| PageHeader/PageFooter | `raw_text` | `id`, `provenance` | — | + +--- + +## 23. Exit Criteria for This Specification + +This specification is ready to be frozen and handed to implementation once: + +1. Every object above has a 1:1 or many:1 mapping back to either an observed + Marker block type or a named architectural requirement (satisfied throughout + this document via the "Origin" column on every field table). +2. 
No field exists whose justification is "might be useful later" rather than + "Marker provides this" or "the architecture requires this for provenance / + determinism / reproducibility / serialization / validation" (satisfied; the one + category that looks speculative — deferred-population slots — is explicitly + justified by the stated requirement to avoid future schema redesign, and is + listed exhaustively in Section 22's third column plus Section 21's explicit + exclusions). +3. No object or field encodes scientific meaning (satisfied — verified against + invariant 1.2 by inspection of the full object list in Sections 4–17). +4. Implementing this specification in Pydantic v2 requires translation, not new + design decisions — the identifier rule (Section 2), provenance rule (Section + 3.3), reading-order rule (Section 3.4), and every per-object field table are + concrete enough to type directly. + +Once reviewed and approved, the next phase is the mechanical translation of this +document into immutable Pydantic v2 models, followed by the Normalizer that +populates them from the Raw Marker Model. diff --git a/docs/document_schema_specification_v1.0.md b/docs/document_schema_specification_v1.0.md new file mode 100644 index 0000000..cb12078 --- /dev/null +++ b/docs/document_schema_specification_v1.0.md @@ -0,0 +1,788 @@ +# Document Schema Specification + +**Project:** LLM-Assisted Extraction of Agronomic and Ecological Experiments into Structured Data +**Layer:** Document Understanding Layer (Document Object only) +**Status:** Draft for review — intended to be frozen upon approval +**Predecessor artifact:** Raw Marker Model (`marker_adapter/raw_model.py`), frozen +**Empirical basis:** `Marker Output — Empirical Findings (Paper 1: Nutrient Cycling, Smukler et al. 2012)` + +This document is an engineering specification, not an implementation. 
It defines every +object in the canonical Document layer — its purpose, fields, types, relationships, +identifier rule, provenance rule, validation rules, and serialization requirements — +so that implementing the Pydantic v2 Document Object later requires translation, not +design. No parsing, normalization, or business logic is described here; only the shape +of the data those future components will populate. + +--- + +## 0. Relationship to the Raw Marker Model + +The Raw Marker Model is a lossless, uninterpreted mirror of Marker's JSON output: a +single `MarkerBlock` envelope type, `extra="allow"`, no discriminated union, no +semantic interpretation. The Document Object described here is the next layer down +the pipeline: it is produced *from* the Raw Marker Model by the Normalizer (not yet +implemented) and is the first place where **structural** interpretation occurs — +deciding what counts as a Table, a Section, a Footnote's likely attachment — while +still containing zero scientific meaning. + +Every Document-layer object therefore exists in addition to, not instead of, the Raw +Marker Model. The Raw Marker Model remains the permanent ground truth on disk; +the Document Object is a derived, queryable, typed structural view over it. This +specification assumes the Raw Marker Model is available as an immutable input and +focuses entirely on what the Normalizer must produce from it. + +--- + +## 1. Architectural Invariants + +These rules are not per-object — they govern the entire Document layer and every +object defined below conforms to them without restating them per-section. + +**1.1 Immutability.** Every Document-layer model is frozen after construction +(Pydantic v2 `model_config = ConfigDict(frozen=True)`). No object is mutated after +the Normalizer finishes building it. Corrections, re-interpretation, or review +happen in later layers (IR, Scientist Review) and never write back into the +Document Object. 
+ +**1.2 Structural-only content.** No Document-layer object may contain a field whose +purpose is to record scientific meaning. Concretely: no `treatment`, `species`, +`observation`, `management_event`, `variable`, or `trait` field exists anywhere in +this schema, even as an optional placeholder. If a future need arises to record such +a concept, it belongs in the IR, which is built on top of — never inside — the +Document Object. + +**1.3 Deterministic identifiers.** No identifier in this schema is a UUID4 or any +other non-deterministic value. Every identifier is a pure function of stable inputs, +so that re-running the same PDF through the same Marker version and the same +Normalizer version always yields byte-identical identifiers. The exact construction +rule is given in Section 2. + +**1.4 Maximum available provenance.** Every object that originates from one or more +Marker blocks retains a `StructuralProvenance` value (Section 3.3) referencing the +originating Marker block id(s), page number, bounding box, polygon, and reading-order +position. An object is never permitted to "lose" its Marker origin even when the +Normalizer reshapes or merges multiple Marker blocks into one Document object (e.g. +turning a `SectionHeader` + `Text` pair into one normalized `Caption`). + +**1.5 Deterministic, lossless serialization.** Every model in this schema must +support `model_dump()` / `model_dump_json()` and round-trip back through +`model_validate()` without information loss, exactly as already verified for the +Raw Marker Model. Field ordering in dumped JSON is determined by declaration order +in the Pydantic model (not insertion order at runtime) to keep serialized output +byte-stable across runs. + +**1.6 Independence from downstream layers.** Nothing in this schema imports from, +references, or anticipates retrieval, LLM extraction, validation, the IR, or BETYdb +export. 
The Document Object's public surface is consumed by those layers, but this +schema has zero knowledge of them. + +--- + +## 2. Identifier Strategy + +**Rule.** Every Document-layer object's `id` is computed as: + +``` +id = "doc:" + sha256( document_id + "|" + canonical_path )[:16] +``` + +where `document_id` is the parent Document's own id (Section 4), and +`canonical_path` is a deterministic structural path string specific to each object +type, defined per-object below (generally derived from the originating Marker +block's own path-like id, e.g. `/page/7/Table/2`, when one exists 1:1; or, for objects +synthesized from multiple Marker blocks or with no direct Marker counterpart — such +as a parsed `TableRow` — a path built from the parent object's id plus an ordinal +position among deterministically-ordered siblings, e.g. `.../Table/2/row/3`). + +**Why a hash rather than reusing Marker's path id directly.** Marker's own ids +(`/page/7/Table/2`) are positional/index-based — `Table/2` means "third +Table-typed block encountered in that page's traversal." If a future Marker version +changes internal traversal order, encounters a new block type, or reorders block +discovery, these indices could silently shift between runs on an unchanged PDF, +producing different "stable" ids for the same content. Hashing a path that is +itself still derived from Marker's structure, combined with the document id, +preserves determinism for a fixed Marker/adapter version while making the contract +explicit: **stability is guaranteed within one Marker version, not promised across +Marker upgrades.** The original Marker path is never discarded — it is preserved +verbatim inside every object's `StructuralProvenance.marker_block_ids` — so a Marker +version bump that changes traversal order is detectable (ids change) and +diagnosable (provenance still shows the old vs. new Marker ids). 
+ +**document_id construction.** `document_id = "betydoc:" + sha256(source_pdf_identifier)[:16]`, +where `source_pdf_identifier` is a stable external identifier for the source PDF +(DOI if known, else a content hash of the source PDF bytes). This deliberately +excludes Marker version and timestamp from the identity computation: the same PDF +must always resolve to the same `document_id` so that re-processing (e.g. after a +Normalizer bug fix) updates the same logical Document Object rather than minting an +unrelated one. Marker version and processing time are recorded as **metadata about +the materialization**, not folded into identity — see `ProcessingMetadata` (Section +6). + +**Properties guaranteed by this scheme:** +- Same PDF + same Marker version + same Normalizer version ⇒ identical ids + throughout the tree. +- Ids are opaque strings, safe to use as dictionary keys, filenames, or database + foreign keys. +- Every id is traceable backward to a concrete Marker block via + `StructuralProvenance`, satisfying invariant 1.4. + +--- + +## 3. Foundational Supporting Types + +These are not top-level entities; they are embedded value objects used throughout +the schema. + +### 3.1 BoundingBox + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `x0` | float | yes | Marker-observed | Left edge | +| `y0` | float | yes | Marker-observed | Top edge | +| `x1` | float | yes | Marker-observed | Right edge | +| `y1` | float | yes | Marker-observed | Bottom edge | + +Directly carried over from Marker's `bbox` (already typed as `MarkerBBox` in the Raw +Marker Model). Retained at the Document layer because footnote-to-table attachment, +evidence highlighting in the Scientist Review UI, and any future geometric +reconstruction (e.g. merged-cell heuristics) all require it. 
**Invariant:** `x1 >= +x0` and `y1 >= y0`; the Normalizer is responsible for not constructing a violating +instance, but the model also validates this on construction since the cost of +allowing silently-inverted boxes downstream is high (evidence UI would render +boxes wrong with no error signal). + +### 3.2 Polygon + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `points` | list of 4 `(float, float)` pairs | yes | Marker-observed | Carried over from Marker's `polygon` | + +Retained even though `BoundingBox` is derivable from it, because Marker provides +both independently and the polygon can in principle capture skew that an +axis-aligned bbox cannot. This is a direct empirical carry-over (already present and +typed in the Raw Marker Model) rather than a new design — Document-layer objects +simply forward it unchanged. No Document-layer object computes one from the other; +both come from Marker as-is. + +### 3.3 StructuralProvenance + +This is the single most important supporting type in the schema — it is what +satisfies invariant 1.4 for every object below. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `marker_block_ids` | list of str | yes (min length 1) | Marker-observed | The originating Marker block id(s), e.g. `["/page/7/Table/2"]`. A list, not a single value, because some Document objects (e.g. a normalized `Caption` under Pattern B) are synthesized from more than one Marker block. | +| `page_number` | int | yes | Marker-observed | The PDF page this object originates from. For objects spanning conceptually across the synthesis of multiple Marker blocks, this is the page of the primary/first contributing block. | +| `bbox` | `BoundingBox` | no | Marker-observed | Present for any object with a single, well-defined originating region. Absent for objects synthesized from multiple non-adjacent blocks where a single bbox would be misleading (e.g. 
a Pattern-B caption combining a `SectionHeader` far above a trailing `Text` note) — in that case `contributing_bboxes` is populated instead. | +| `contributing_bboxes` | list of `BoundingBox` | no | Marker-observed | Used instead of `bbox` — never in addition to it, per the `StructuralProvenance` validation rule in Section 20 — when more than one Marker block contributes geometry, preserving each one rather than collapsing them into a single misleading box. | +| `polygon` | `Polygon` | no | Marker-observed | Mirrors `bbox`'s optionality logic. | +| `reading_order_index` | int | yes | Architectural requirement | The object's position in the document's global linear reading order (Section 3.4). Required on every provenance instance because every structural object has a place in reading order even if its bbox is ambiguous. | +| `section_path` | list of str | yes (may be empty) | Marker-observed (derived) | The chain of governing `SectionHeader` Marker-block ids from Marker's own `section_hierarchy` map, ordered outermost to innermost. Empty only for objects outside any section (e.g. a journal wrapper page's `Picture`). | + +**Why a list of Marker block ids rather than exactly one.** Empirically, not every +Document-layer concept maps 1:1 to a Marker block. The clearest case is `Table` +captions: under Pattern A (`TableGroup`), the caption is one `Caption` block; under +Pattern B (bare `Table`), the equivalent information is split across a +`SectionHeader` block and a `Text` block, sometimes with a trailing "Note:" `Text` +block. Forcing a single-id provenance field would require silently picking one +contributing block and losing the others. A list preserves all of them, satisfying +invariant 1.4 even when normalization merges several Marker blocks into one +Document concept. 
+ +### 3.4 Reading Order + +Reading order is **not** a field on a supporting type — it is a global integer +sequence assigned by the Normalizer to every leaf and container object during +construction, equal to that object's position in a single depth-first traversal of +the final Document Object tree, in `children` array order. + +This decision is made explicitly here because it was a confirmed empirical finding, +not a default assumption: Marker's own block id local-index numbers (e.g. the +trailing `4` in `/page/7/Footnote/4`) are **not** monotonic with true reading order — +`Footnote/4` and `Footnote/5` physically appear, in the actual children array, after +`Table/8`, despite having lower index numbers. Reading order must therefore be +(re)computed by the Normalizer from final tree position, never inferred from Marker's +id numbering. `StructuralProvenance.reading_order_index` is this recomputed value, +not a copy of any number embedded in a Marker id string. + +### 3.5 Section Path + +`StructuralProvenance.section_path` is populated directly from Marker's own +`section_hierarchy` dict, which the empirical findings confirmed is already a +precomputed breadcrumb (e.g. a deeply nested `TableCell` carrying +`{'1': '/page/1/SectionHeader/1', '4': '/page/7/SectionHeader/0'}`). Two properties +of this dict are carried forward into the spec rather than assumed away: + +- **Depth keys are not contiguous small integers.** The observed keys were `'1'` + and `'4'`, not `'1'` and `'2'`, indicating these correspond to some absolute + nesting depth from Marker's internal traversal rather than a clean rank. 
The + Document schema therefore stores `section_path` as an **ordered list of + SectionHeader Marker-block ids** (sorted by the numeric value of their original + dict key) rather than preserving Marker's dict-with-gaps shape — this gives + downstream consumers (Retrieval layer, Section nesting) a clean, ordinary list + without forcing them to understand Marker's internal depth-key semantics. +- **The mapping is per-block, not per-Section-object.** Every Marker block — + including deeply nested ones like a `TableCell` — carries its own full path. The + Document Object's `Section` containment hierarchy (Section 9) is derived from this + same data, so `section_path` on any object and that object's ancestor `Section` + chain are guaranteed consistent by construction, not by a separate invariant check. + +--- + +## 4. Document + +**Purpose.** The root container for one processed paper. Holds the page sequence, +top-level metadata, processing metadata, and aggregate statistics. Exactly one +`Document` exists per source PDF per Normalizer run. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `id` | str | yes | Architectural requirement | `document_id`, Section 2. | +| `source_pdf_identifier` | str | yes | Architectural requirement | The stable external identifier used to compute `id` (DOI or content hash). Stored explicitly so the id's derivation is independently checkable, not just trusted. | +| `metadata` | `Metadata` | yes | Marker-observed + Architectural | Section 5. | +| `processing_metadata` | `ProcessingMetadata` | yes | Architectural requirement | Section 6. | +| `statistics` | `Statistics` | yes | Architectural requirement | Section 7. | +| `pages` | list of `Page` | yes (min length 1) | Marker-observed | Ordered by page number ascending; this ordering is also the top level of global reading order. 
| + +**Invariants.** +- `pages` is non-empty and sorted ascending by `Page.page_number` with no + duplicate or skipped page numbers other than what Marker itself reported (a + Marker-side page omission is preserved, not silently re-numbered). +- `Document` is the only object in this schema with no `StructuralProvenance` of + its own (there is no single Marker block representing "the whole document" — the + Raw Marker Model's root node was empirically confirmed to have no `id`, `bbox`, + or `polygon` at all). Its provenance is implicitly "the entire Raw Marker Model + file," which `processing_metadata.source_marker_artifact_ref` captures (Section + 6) rather than a `StructuralProvenance` instance. + +--- + +## 5. Metadata + +**Purpose.** Bibliographic and identification facts about the paper, to the extent +they are structurally recoverable (not semantically extracted — see the boundary +note below). + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `title` | Optional[str] | no | Marker-observed | Taken verbatim from the first/top-level `SectionHeader` or title-styled block on the front matter page, if structurally identifiable. | +| `page_count` | int | yes | Marker-observed | Count of `Page` objects; redundant with `len(pages)` but kept as an explicit field since `Statistics` (Section 7) is meant to hold *derived counts*, while this one is a basic identifying fact worth surfacing without traversing the tree. | +| `has_front_matter_page` | bool | yes | Marker-observed (heuristic) | Whether page 0 (or any page) was structurally flagged as publisher wrapper content. See `Page.is_front_matter` (Section 8) for the per-page flag this aggregates. | + +**Boundary note.** `Metadata` deliberately does **not** include authors, journal +name, publication year, or DOI as structured fields, even though these are +intuitively "metadata." 
Per the empirical findings (3.8), front-matter and +citation-bearing content is **structurally indistinguishable** from other text at +the block-type level — recovering "the authors" or "the journal" requires reading +and interpreting text content, which is scientific/semantic extraction, not +structural parsing. That work belongs to the IR's `Citation` entity (already +specified in the project's broader IR design), built by the extraction layer. This +spec only exposes what is mechanically true of the page structure (title block +location, page count, front-matter flag) — adding speculative `author`/`doi`/`year` +fields here would violate invariant 1.2 and the "no speculative fields" instruction, +since populating them correctly is not a structural operation. + +--- + +## 6. ProcessingMetadata + +**Purpose.** Records *how* this particular Document Object was produced, separate +from *what* it identifies (Section 4's `id`/`source_pdf_identifier`). This is what +makes reproducibility checkable: two Document Objects with the same `id` but +different `ProcessingMetadata` indicate the same paper was processed by a different +Marker or Normalizer version, which is exactly the signal needed to detect drift +without conflating it with document identity. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `marker_version` | str | yes | Marker-observed | Verbatim from Marker's own output metadata, if present; otherwise the version string of the Marker invocation recorded by the adapter. | +| `normalizer_version` | str | yes | Architectural requirement | Semantic version of the Normalizer code that produced this Document Object. Required so a future schema/logic change is always attributable. | +| `processed_at` | datetime (ISO 8601, UTC) | yes | Architectural requirement | Wall-clock time of this materialization. Explicitly **not** part of `id` computation (Section 2) — recorded for audit/debugging only. 
| +| `source_marker_artifact_ref` | str | yes | Architectural requirement | A path or content hash identifying the exact Raw Marker Model JSON file this Document Object was normalized from, satisfying the "Document has no own provenance" note in Section 4 by pointing at the file-level artifact instead of a block-level one. | + +**Why this is architectural rather than Marker-observed for most fields.** Only +`marker_version` comes from Marker itself; the rest exist purely because the +project's stated reproducibility requirement ("identical PDFs ... should always +produce identical Document Objects," and detectability of drift) demands a place to +record the inputs that determine reproducibility, even though Marker's own output +has no opinion on them. + +--- + +## 7. Statistics + +**Purpose.** Aggregate counts over the final Document Object tree, useful for +sanity-checking a Normalizer run (e.g. "did this paper produce zero tables when the +PDF clearly has six") without re-traversing the tree ad hoc. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `page_count` | int | yes | Architectural requirement (derived) | `len(pages)`. | +| `section_count` | int | yes | Architectural requirement (derived) | Total `Section` objects across the document. | +| `paragraph_count` | int | yes | Architectural requirement (derived) | Total `Paragraph` objects. | +| `table_count` | int | yes | Architectural requirement (derived) | Total `Table` objects. | +| `figure_count` | int | yes | Architectural requirement (derived) | Total `Figure` objects. | +| `equation_count` | int | yes | Architectural requirement (derived) | Total `Equation` objects. | +| `footnote_count` | int | yes | Architectural requirement (derived) | Total `Footnote` objects. | +| `reference_count` | int | yes | Architectural requirement (derived) | Total `Reference` objects. 
| +| `unresolved_footnote_count` | int | yes | Architectural requirement (derived) | Footnotes whose `attached_object_id` (Section 15) is `None` after Normalizer processing — a direct, queryable signal of how much of the geometric-attachment heuristic (empirical finding 3.2) succeeded on this paper. | + +**Why this object exists at all, given everything in it is derivable.** Every +field here is computable by traversal, so in principle `Statistics` adds no new +information. It exists as an explicit object — rather than leaving consumers to +compute it themselves — because (a) it gives a single, serializable snapshot for +logging/comparison across Normalizer runs without re-parsing the whole tree, and (b) +`unresolved_footnote_count` specifically operationalizes a concern raised directly +in the empirical findings (footnote attachment is a heuristic, not guaranteed) into +a number that can be tracked across the representative paper set as the Normalizer +is built and tuned. + +--- + +## 8. Page + +**Purpose.** One PDF page's structural content, in reading order. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `id` | str | yes | Architectural requirement | Per Section 2; `canonical_path = "/page/{page_number}"`. | +| `page_number` | int | yes | Marker-observed | Zero-indexed, matching Marker's own page numbering. | +| `provenance` | `StructuralProvenance` | yes | Marker-observed | `marker_block_ids = [the Marker Page block's id]`. | +| `children` | list of (`Section` \| `Paragraph` \| `Table` \| `Figure` \| `Equation` \| `Footnote` \| `PageHeader` \| `PageFooter`) | yes (may be empty) | Marker-observed | Top-level content of the page, in final reading order (Section 3.4). A discriminated union over `block_type`-equivalent kinds, mirroring (but not reusing) Marker's own children-array structure. 
| +| `is_front_matter` | bool | yes | Marker-observed (heuristic) | True if this page was identified as publisher wrapper content (journal cover, "Submit your article," ISSN-only content, etc.) rather than paper body. | + +**On `is_front_matter`.** Empirical finding 3.8 established that Marker gives no +structural signal distinguishing a wrapper page from a content page — both use +identical block types. This flag is therefore explicitly a **heuristic output of +the Normalizer** (content-pattern based, e.g. presence of "ISSN," "Submit your +article," near-total absence of citation-bearing text), not something copied from +Marker. The field is included now, with its value to be computed later, because the +project's stated requirement is that the schema accommodate this known case without +redesign — per the same logic as the other deferred-population fields in this spec +(Section 22 collects all of them explicitly). + +**Why a discriminated union for `children` rather than `list[Any]` or one generic +`Block` type.** The Raw Marker Model deliberately uses one uniform envelope because +it must stay agnostic to block semantics. The Document Object's job is the opposite: +it exists specifically to make structural type distinctions (a `Table` is not +interchangeable with a `Paragraph` downstream). A discriminated union gives +consumers static type safety and keeps `Page`/`Section` children lists +self-describing in serialized JSON via the discriminator field, with no loss of +ordering since list order is itself the reading-order signal (Section 3.4). + +--- + +## 9. Section + +**Purpose.** A heading-governed grouping of content, derived from Marker's +`section_hierarchy` breadcrumbs (Section 3.5) rather than re-derived from text +pattern-matching on headings. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `id` | str | yes | Architectural requirement | `canonical_path` built from the governing `SectionHeader` Marker block's own path id. 
| +| `heading_text` | str | yes | Marker-observed | Verbatim text of the governing `SectionHeader` block. | +| `provenance` | `StructuralProvenance` | yes | Marker-observed | `marker_block_ids = [the SectionHeader block's id]`. | +| `depth` | int | yes | Marker-observed (derived) | Position of this section's heading in the ordered `section_path` list (Section 3.5), zero-indexed from the outermost heading on the page/document. | +| `children` | list of (`Section` \| `Paragraph` \| `Table` \| `Figure` \| `Equation` \| `Footnote`) | yes (may be empty) | Marker-observed | Nested sub-sections and content governed by this heading, in reading order. A `Section` may contain further `Section` objects, giving the hierarchy genuine nesting rather than a flat list with a depth integer alone. | + +**Invariant.** Every leaf or container object elsewhere in the schema that carries +a non-empty `section_path` in its `StructuralProvenance` must have a corresponding +ancestor chain of `Section` objects matching that path exactly — this is guaranteed +by construction (both are derived from the same `section_hierarchy` source, per +Section 3.5) rather than checked as a runtime validator, but it is stated here as a +hard design invariant the Normalizer must not violate. + +**Why `SectionHeader` blocks that are really table/figure labels (e.g. a Marker +`SectionHeader` containing only `"Table 3"`, per Pattern B) do not become `Section` +objects.** Empirical finding 3.1 showed Marker uses the same `SectionHeader` +block type both for genuine paper sections (Methods, Results) and for bare-table +caption labels. The Normalizer must distinguish these by context — a +`SectionHeader` immediately followed by a `Text` block and then a `Table`, with no +intervening structural content, is a caption label being consumed into that +`Table`'s `Caption` (Section 12), not a new `Section`. 
This rule is recorded here so +the schema's `Section` object is understood to represent only genuine paper +sections; the disambiguation logic itself is Normalizer business logic (out of +scope for this document) but the *consequence* — that some `SectionHeader` Marker +blocks become part of a `Caption` rather than a `Section` — is a structural decision +the schema must support, and it does: `Caption.provenance.marker_block_ids` can +include a `SectionHeader` id (Section 11). + +--- + +## 10. Paragraph + +**Purpose.** A single block of body text — the most common leaf content type. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `id` | str | yes | Architectural requirement | Per Section 2. | +| `text` | str | yes | Marker-observed | The block's inline HTML content from Marker, **as-is** (e.g. `<i>`, `<b>` tags preserved). | +| `provenance` | `StructuralProvenance` | yes | Marker-observed | `marker_block_ids = [the originating Text/ListItem block's id]`. | + +**Why `text` keeps inline HTML rather than being plain-text-stripped.** Empirical +finding (leaf block dump) confirmed Marker leaf blocks carry real semantic inline +markup (`<i>`, `<b>`) directly in their content, not a side annotation. Stripping +it at the Document layer would be a one-way, lossy transformation performed before +any consumer has had a chance to decide whether that markup matters (e.g. an `<i>` +emphasis inside a Methods paragraph could matter to the extraction layer's reasoning +about emphasis on a key term). Per invariant 1.4 (maximum available provenance) and +the general "never lose information without a consumer-side decision to do so" +principle, the Document layer preserves it verbatim; any stripping is a retrieval- +or extraction-layer concern, explicitly out of scope here. 
+ +**Note on `ListItem`/`ListGroup`.** Marker's bibliography uses `ListGroup` containers +of `ListItem` leaves rather than `Text` blocks (this was the basis for separating +references structurally without text pattern-matching). A `ListItem` that is part of +a reference list is **not** modeled as a `Paragraph` — it becomes a `Reference` +(Section 16). A `Paragraph` is reserved for body-text `Text`/generic `ListItem` +content; the Normalizer disambiguates by parent context (a `ListGroup` under the +References section vs. elsewhere), again business logic out of scope here, but the +schema accommodates the distinct outcome via two separate object types. + +--- + +## 11. Caption (supporting type, embedded in Table and Figure) + +**Purpose.** A normalized representation of a table or figure's caption, +collapsing Marker's two empirically observed patterns into one shape. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `label` | Optional[str] | no | Marker-observed | E.g. `"Table 3"` or `"Figure 1"`. Present whenever a `Caption` block (Pattern A) or a `SectionHeader` label block (Pattern B) was found. | +| `text` | Optional[str] | no | Marker-observed | The descriptive caption sentence. From the `Caption` block's content (Pattern A) or the `Text` block immediately following the label (Pattern B). | +| `trailing_notes` | Optional[str] | no | Marker-observed | The trailing "Note: ..." `Text` block sometimes observed immediately after a `Table`, distinct from both `label`/`text` and from `Footnote` objects (Section 15). Kept as its own field because it was empirically observed to be part of the caption apparatus, not body text, but also not a true Marker `Footnote` block. | +| `provenance` | `StructuralProvenance` | yes | Marker-observed | `marker_block_ids` lists every contributing Marker block (one for Pattern A's single `Caption` block; two or three for Pattern B's `SectionHeader` + `Text` + optional trailing `Text`). 
Uses `contributing_bboxes` (Section 3.3) rather than a single `bbox` whenever more than one block contributes, since collapsing non-adjacent regions into one bbox would misrepresent the geometry. | + +**Why one normalized shape rather than preserving Marker's two patterns +separately in the schema.** This is the central case the project's "empirically +driven, not speculative" instruction is built around: both patterns were directly +observed (Pattern A on pages 6/10/12, Pattern B on page 7, per finding 3.1), so +normalizing them is not a hypothetical convenience — it is required because every +downstream consumer (extraction layer asking "what is this table about," review UI +displaying "the caption") needs one consistent shape regardless of which pattern the +source PDF happened to produce. Modeling them as two different optional sub-objects +instead would push that disambiguation work onto every consumer rather than once, +inside the Normalizer, where the empirical knowledge of the two patterns actually +lives. + +--- + +## 12. Table + +**Purpose.** A table's logical structure and its evidence-level cell geometry, +kept as two deliberately parallel representations per the empirical recommendation. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `id` | str | yes | Architectural requirement | Per Section 2. | +| `provenance` | `StructuralProvenance` | yes | Marker-observed | `marker_block_ids = [the Table block's id]` (and the `TableGroup` id too, if Pattern A). | +| `caption` | Optional[`Caption`] | no | Marker-observed | Section 11. `None` only if no caption-bearing blocks were found adjacent to the table at all (not empirically observed in the representative paper, but not excluded as a possibility — captionless tables are not assumed impossible). | +| `raw_html` | str | yes | Marker-observed | The Table block's own `html` field, verbatim — the complete, correctly-nested `<table>...</table>` Marker produces. Treated as the **source of truth for logical structure** (rows, columns, header rows), per the empirical recommendation, precisely because reconstructing structure independently from cell geometry risks disagreeing with Marker's own (already correct) parse. | +| `rows` | list of `TableRow` | yes (may be empty) | Marker-observed (derived) | A structured parse of `raw_html`'s `<tr>` elements into row objects (Section 12.1), giving consumers row/column access without re-parsing HTML themselves. Derived from `raw_html`, not an independent reconstruction. | +| `cells` | list of `TableCell` | yes (may be empty) | Marker-observed | The flat list of Marker `TableCell` child blocks, retained **only** as evidence/geometry data (bbox, polygon, provenance) — explicitly not used to derive row/column structure, per the empirical recommendation that `raw_html` is structural truth and `TableCell` geometry is supplementary. | +| `footnote_ids` | list of str | yes (may be empty) | Architectural requirement (deferred population) | Ids of `Footnote` objects geometrically attached to this table (Section 15). Empty until the Normalizer's bbox-proximity heuristic (finding 3.2) runs; the field exists now so that heuristic's output has a defined home without later schema change. | + +### 12.1 TableRow (supporting type, embedded in `Table.rows`) + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `cells` | list of `TableRowCell` | yes | Marker-observed (derived) | Ordered left to right per the source `<tr>`. 
| + +### 12.2 TableRowCell (supporting type, embedded in `TableRow.cells`) + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `text` | str | yes | Marker-observed | Cell text content from the parsed `<td>`/`<th>` element, with any `<math>` wrapper tag stripped and its content treated as equivalent plain text (per empirical finding 3.5 — Marker inconsistently wraps numerically identical `mean ± stderr` values in `<math>` depending on OCR path; the schema does not preserve this distinction since it carries no structural meaning, only an OCR-routing artifact). | +| `is_header` | bool | yes | Marker-observed | True if the source element was `<th>`, false for `<td>`. | +| `structural_notes` | Optional[str] | no | Architectural requirement (deferred) | A free-text slot reserved for a Normalizer-attached structural annotation — most notably, a suspected merged-cell placeholder (empirical finding 3.4: Marker silently flattens merged header cells into duplicated rows with an empty filler cell, with no flag distinguishing this from a genuinely empty cell). The heuristic for populating this field is explicitly **not** decided in this specification — finding 3.4 was flagged as needing more representative papers before a reconstruction rule is chosen. The field is included as an open slot precisely so that decision can be made later without a schema change, consistent with the brief's instruction to accommodate known structural cases without redesign. | + +**Why `TableRowCell` does not have `row_index`/`col_index` integers.** These are +implicit in `Table.rows`' list-of-lists structure itself (a cell's row is its +containing `TableRow`'s position in `rows`; its column is its own position in +`cells`), so adding redundant integer fields would duplicate information already +present in list order, with no Marker-observed justification for storing it twice. 
+ +**Why `TableRowCell` has no `rowspan`/`colspan` field.** Empirical finding 3.4 +confirmed Marker's HTML output never emits `rowspan`/`colspan` attributes even where +the source PDF visually has merged cells — it flattens instead. Adding a +`rowspan`/`colspan` field for a case never observed in Marker's actual output would +violate the "no speculative fields" instruction. If a future representative paper +demonstrates Marker does emit span attributes under some condition, this is the +single place such fields would be added. + +### 12.3 TableCell (supporting type, embedded in `Table.cells`; evidence-only) + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `id` | str | yes | Architectural requirement | Per Section 2; path derived from the Marker `TableCell` block's own id. | +| `text` | str | yes | Marker-observed | Verbatim cell content (not math-stripped here — this object is evidence/geometry, not the logical text consumers should read; `TableRowCell.text` is the cleaned version). | +| `bbox` | `BoundingBox` | yes | Marker-observed | Per-cell geometry, the entire reason this parallel representation is retained (evidence highlighting in the review UI). | +| `polygon` | `Polygon` | yes | Marker-observed | Mirrors `bbox`. | + +**Why this evidence-only `TableCell` and the logical `TableRowCell` are not unified +into one type.** Empirical finding 3.3 established these are genuinely two +different, only partially-corresponding representations Marker provides in +parallel — one (the `<table>` HTML) has correct logical structure but no per-cell +geometry, the other (`TableCell` children) has per-cell geometry but no row/column +index. Forcing them into a single type would require either fabricating row/column +indices on the geometry side (an unverified bbox-clustering reconstruction the +findings explicitly flagged as risky) or discarding per-cell geometry on the logical +side (losing the evidence-highlighting capability entirely). 
Keeping them separate, +each true to what Marker actually provides, is the choice that adds no +unverified inference. **Open implementation note, not a schema decision:** +positionally correlating a given `TableRowCell` with its corresponding `TableCell` +(for evidence highlighting of a specific logical cell) is left to the Normalizer to +attempt via parse-order correspondence; this spec does not assert that +correspondence is guaranteed, since it was not empirically verified. + +--- + +## 13. Figure + +**Purpose.** A figure region, with its caption normalized the same way as `Table`. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `id` | str | yes | Architectural requirement | Per Section 2. | +| `provenance` | `StructuralProvenance` | yes | Marker-observed | `marker_block_ids = [the Figure block's id]` (and `FigureGroup` id, if present). | +| `caption` | Optional[`Caption`] | no | Marker-observed | Section 11. Empirically, `FigureGroup` always pairs `[Figure, Caption]` in that order (mirroring `TableGroup`'s pairing, just with reversed order — confirmed, not assumed, per the findings doc). | +| `image_data` | Optional[bytes] | no | Marker-observed | Base64-decoded raster image content from Marker's `images` field, when present. Empirically, in the representative paper, only `Picture` blocks (journal logo, cover thumbnail) carried non-empty `images`; the one `Figure` block had `images: {}`. This field is therefore included (figures plausibly can carry raster data, and the project must not assume they never will) but its emptiness in the current evidence base is recorded explicitly in Section 22 as unconfirmed, not silently assumed resolved. | + +--- + +## 14. Equation + +**Purpose.** A mathematical expression block. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `id` | str | yes | Architectural requirement | Per Section 2. 
| +| `provenance` | `StructuralProvenance` | yes | Marker-observed | `marker_block_ids = [the Equation block's id]`. | +| `raw_math` | str | yes | Marker-observed | The verbatim MathML-ish `<math>` content, including any equation number embedded inline (e.g. `"DP = I - IR + P - ETc \pm VR, \qquad (1)"`), per empirical finding 3.7. | +| `equation_number` | Optional[str] | no | Architectural requirement (deferred) | A slot for the parsed-out equation number (e.g. `"1"`), since finding 3.7 confirmed Marker provides no separate field for it — any cross-reference resolution ("using equation (1)" in body text) requires parsing it out of `raw_math`. The parsing logic itself is out of scope for this spec; the field exists so its result has a defined home. | + +--- + +## 15. Footnote + +**Purpose.** A footnote block, with its attachment to a table or figure resolved +geometrically rather than structurally, per empirical finding 3.2. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `id` | str | yes | Architectural requirement | Per Section 2. | +| `provenance` | `StructuralProvenance` | yes | Marker-observed | `marker_block_ids = [the Footnote block's id]`. Note: this block's own provenance never implies attachment — footnotes are flat page-level siblings, not children of any Table/Figure (finding 3.2), so attachment is recorded separately below. | +| `raw_text` | str | yes | Marker-observed | Verbatim footnote content. | +| `attached_object_id` | Optional[str] | no | Architectural requirement (deferred) | The id of the `Table` or `Figure` this footnote was determined to belong to, via the Normalizer's bbox-proximity heuristic ("nearest preceding Table/Figure on the same page by bbox y-position," per finding 3.2). `None` when unresolved — tracked in aggregate by `Statistics.unresolved_footnote_count` (Section 7). 
The heuristic itself is Normalizer logic, out of scope here; the field exists so its output, including the legitimate possibility of non-resolution, has a defined, queryable home. | + +**Why attachment is nullable rather than required.** Forcing every footnote to +resolve to a table/figure would hide genuine ambiguity (e.g. a footnote whose +geometric position is equidistant between two candidates, or a page-level +disclaimer footnote unrelated to any table) behind an incorrect best-guess. Per the +project's broader principle (already established for the IR: "fields that cannot be +resolved are marked with an unresolved status rather than silently filled"), the +same discipline applies at the structural layer: `None` is a legitimate, recorded +outcome, not an implementation gap to paper over. + +--- + +## 16. Reference (Bibliography Entry) + +**Purpose.** One bibliography entry, structurally distinguished from body text +because Marker represents references via `ListGroup`/`ListItem`, not `Text` blocks +(observed directly in the page_stats/structure walkthrough, not inferred from +content). + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `id` | str | yes | Architectural requirement | Per Section 2. | +| `provenance` | `StructuralProvenance` | yes | Marker-observed | `marker_block_ids = [the ListItem block's id]`. | +| `raw_text` | str | yes | Marker-observed | Verbatim reference entry text, including any inline markup Marker preserved. | + +**Boundary note.** Like `Metadata` (Section 5), `Reference` deliberately stops at +verbatim text. Parsing a reference string into author/year/journal/DOI fields is +citation-matching — a semantic operation belonging to the IR's `Citation` entity, +not this layer. This object's only job is to say "this `ListItem`, structurally, +is a bibliography entry, not body text," which is information Marker's block typing +already gives for free via the `ListGroup` container. + +--- + +## 17. 
PageHeader / PageFooter + +**Purpose.** Repeated journal running-header/footer content (e.g. the journal name +repeated on every page, or page-footer branding), retained for completeness and +front-matter heuristics (Section 8) but not expected to be consumed by extraction. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `id` | str | yes | Architectural requirement | Per Section 2. | +| `provenance` | `StructuralProvenance` | yes | Marker-observed | `marker_block_ids = [the PageHeader/PageFooter block's id]`. | +| `raw_text` | str | yes | Marker-observed | Verbatim content. | + +Modeled as two distinct types (`PageHeader`, `PageFooter`) rather than one generic +"running content" type, simply mirroring Marker's own distinct block types +one-to-one — there is no structural reason to merge them, and merging would lose +the type distinction Marker itself already makes. + +--- + +## 18. Cross-Object Relationships & Invariants Summary + +This section consolidates relationship rules stated piecemeal above, for a single +point of reference. + +- **Containment is exclusively via `children` lists and explicit id-reference + fields (`footnote_ids`, `attached_object_id`) — never via implicit ordering + conventions or id-string parsing.** Any consumer needing "what footnotes belong + to this table" reads `Table.footnote_ids`, never re-derives it from geometry + itself; the Normalizer computes that relationship exactly once. +- **Reading order is a single global property (Section 3.4), independent of any + per-object containment.** Two sibling objects under different `Section`s can be + compared for relative reading order via their `reading_order_index` without + needing to know anything about section nesting. +- **Every non-`Document` object carries exactly one `StructuralProvenance`,** which + is the only place Marker block ids appear outside of `ProcessingMetadata`'s + artifact reference. No object duplicates Marker ids elsewhere in its own fields. 
+- **No object type defined in this specification has a field referencing an IR, + retrieval, validation, or export concept**, satisfying invariant 1.6 by + construction — this is checked by inspection of this document, not by a runtime + rule, since it is a closed schema with a fixed object list. + +--- + +## 19. Serialization Requirements + +- All models use `model_config = ConfigDict(frozen=True, extra="forbid")`. Unlike + the Raw Marker Model (which intentionally used `extra="allow"` for forward + compatibility with unknown future Marker fields), the Document Object is the project's own + designed contract — an unexpected extra field here indicates a Normalizer bug, + not a benign future Marker addition, so it should fail loudly (`extra="forbid"`) + rather than silently passing through. +- `model_dump_json()` must be deterministic for a given object graph: field order + follows declaration order (Pydantic v2 default), list order follows the + semantically meaningful order already specified per field (reading order for + children, left-to-right for table cells, outermost-to-innermost for + `section_path`) — never a non-deterministic order like dict-hash order. + `datetime` fields serialize as ISO 8601 strings in UTC. +- Every model must round-trip losslessly through `model_dump()` → + `model_validate()` and `model_dump_json()` → `model_validate_json()`, mirroring + the test discipline already established and passing for the Raw Marker Model. +- Bytes fields (`Figure.image_data`) serialize as base64 strings in JSON, matching + Marker's own convention for `images`, so no separate encoding scheme is + introduced at this layer. + +--- + +## 20. Validation Rules + +These are construction-time invariants enforced by each model's own validators, +distinct from the cross-cutting invariants in Section 1 (which are policies the +Normalizer must follow, not all individually mechanically checkable). + +- `BoundingBox`: `x1 >= x0` and `y1 >= y0`. 
+- `StructuralProvenance`: `marker_block_ids` has at least one element; + `reading_order_index >= 0`; at most one of `bbox` or `contributing_bboxes` is + populated (neither, for objects with genuinely no recoverable geometry) — never + both, to avoid two disagreeing geometric claims about the same object. +- `Document`: `pages` non-empty; `page_number` values across `pages` are unique. +- `Page`: `page_number >= 0`. +- `Table`: if `rows` is non-empty, every `TableRow.cells` list has at least one + element (a row with zero cells is not a meaningful row — such input indicates a + parse error in `raw_html`, which should surface as a Normalizer-time error, not a + silently-accepted empty row in the Document Object). +- `Footnote`: no validation forces `attached_object_id` to be set — its absence is + valid by design (Section 15). +- `Statistics`: every count field is `>= 0`; `unresolved_footnote_count <= + footnote_count` (a basic sanity bound the model itself can check independent of + whatever produced the numbers). + +--- + +## 21. Explicitly Deferred — Not Modeled, By Design + +Per the instruction to avoid speculative fields, the following structural cases +identified during evaluation are **intentionally absent** from this schema rather +than represented with a guessed-at field shape, because the representative paper +set does not yet provide enough evidence to know what shape is correct: + +- **Multi-page table continuation.** Not observed in the representative paper (no + table spans a page break). No `continues_on_page` / `continuation_of_table_id` + field is added speculatively. When a representative paper exhibiting this is + evaluated, this section is where such a field would be added — as an addition, + not a redesign, since `Table` already has a stable `id` to reference.
+- **Multi-panel figure decomposition.** The representative paper's Figure 3 has 10 + visually lettered sub-panels under one shared caption, but Marker recorded it as + a single flat `Figure` block with no internal panel structure. Since this is the + only data point (n=1) and it shows Marker *not* decomposing panels, no `panels: + list[FigurePanel]` field is added on the strength of a PDF-visual observation that + contradicts what Marker itself outputs. If a future paper shows Marker does + sometimes decompose panels, this is where that field would be introduced. +- **TableGroup-vs-bare-Table triggering condition.** Both patterns are modeled + (via `Caption`'s flexible provenance, Section 11), but *why* Marker chooses one + over the other (single table per region vs. dense multi-table page, per the one + data point available) is not encoded as a schema concept — it doesn't need to be, + since the Document Object normalizes both outcomes into the same `Caption` shape + regardless of cause. +- **Merged-cell reconstruction heuristic.** The *slot* (`TableRowCell.structural_ + notes`) exists (Section 12.2), but the specific rule for populating it (e.g. + "empty cell directly below a filled cell in the same column ⇒ merged-placeholder + suspected") is explicitly not decided here, per finding 3.4's own conclusion that + this needs more representative papers first. + +--- + +## 22. Summary — Field Origin Distribution + +A consolidated view of the distinction requested for this specification: how many +fields per object are direct Marker carry-overs versus existing purely to satisfy +an architectural requirement (provenance, determinism, reproducibility, +serialization) versus reserved as a deferred-population slot for a Normalizer +heuristic not yet designed. 
+ +| Object | Marker-observed fields | Architectural-requirement fields | Deferred-population slots | +|---|---|---|---| +| Document | `pages` | `id`, `source_pdf_identifier`, `metadata`, `processing_metadata`, `statistics` | — | +| Metadata | `title`, `page_count`, `has_front_matter_page` | — | — | +| ProcessingMetadata | `marker_version` | `normalizer_version`, `processed_at`, `source_marker_artifact_ref` | — | +| Statistics | — | all count fields | — | +| Page | `page_number`, `children`, `is_front_matter` (heuristic) | `id`, `provenance` | — | +| Section | `heading_text`, `depth`, `children` | `id`, `provenance` | — | +| Paragraph | `text` | `id`, `provenance` | — | +| Caption | `label`, `text`, `trailing_notes` | `provenance` | — | +| Table | `raw_html`, `rows`, `cells`, `caption` | `id`, `provenance` | `footnote_ids` | +| TableRowCell | `text`, `is_header` | — | `structural_notes` | +| TableCell | `text`, `bbox`, `polygon` | `id` | — | +| Figure | `caption`, `image_data` | `id`, `provenance` | — | +| Equation | `raw_math` | `id`, `provenance` | `equation_number` | +| Footnote | `raw_text` | `id`, `provenance` | `attached_object_id` | +| Reference | `raw_text` | `id`, `provenance` | — | +| PageHeader/PageFooter | `raw_text` | `id`, `provenance` | — | + +--- + +## 23. Exit Criteria for This Specification + +This specification is ready to be frozen and handed to implementation once: + +1. Every object above has a 1:1 or many:1 mapping back to either an observed + Marker block type or a named architectural requirement (satisfied throughout + this document via the "Origin" column on every field table). +2. 
No field exists whose justification is "might be useful later" rather than + "Marker provides this" or "the architecture requires this for provenance / + determinism / reproducibility / serialization / validation" (satisfied; the one + category that looks speculative — deferred-population slots — is explicitly + justified by the stated requirement to avoid future schema redesign, and is + listed exhaustively in Section 22's third column plus Section 21's explicit + exclusions). +3. No object or field encodes scientific meaning (satisfied — verified against + invariant 1.2 by inspection of the full object list in Sections 4–17). +4. Implementing this specification in Pydantic v2 requires translation, not new + design decisions — the identifier rule (Section 2), provenance rule (Section + 3.3), reading-order rule (Section 3.4), and every per-object field table are + concrete enough to type directly. + +Once reviewed and approved, the next phase is the mechanical translation of this +document into immutable Pydantic v2 models, followed by the Normalizer that +populates them from the Raw Marker Model. diff --git a/docs/document_schema_specification_v1.1.md b/docs/document_schema_specification_v1.1.md new file mode 100644 index 0000000..bbf8deb --- /dev/null +++ b/docs/document_schema_specification_v1.1.md @@ -0,0 +1,826 @@ +# Document Schema Specification + +**Project:** LLM-Assisted Extraction of Agronomic and Ecological Experiments into Structured Data +**Layer:** Document Understanding Layer (Document Object only) +**Version:** 1.1 +**Status:** Frozen — implementation contract +**Supersedes:** Version 1.0 (frozen, preserved unmodified as the historical approved specification at `document_schema_specification_v1.0.md`) +**Predecessor artifact:** Raw Marker Model (`marker_adapter/raw_model.py`), frozen +**Empirical basis:** `Marker Output — Empirical Findings (Paper 1: Nutrient Cycling, Smukler et al. 
2012)` + +This document is an engineering specification, not an implementation. It defines every +object in the canonical Document layer — its purpose, fields, types, relationships, +identifier rule, provenance rule, validation rules, and serialization requirements — +so that implementing the Pydantic v2 Document Object later requires translation, not +design. No parsing, normalization, or business logic is described here; only the shape +of the data those future components will populate. + +--- + +## Changelog + +**Version 1.1** (current). Resolves a structural omission discovered during +implementation of `Section` (Document Understanding Layer build): `Reference` was +defined as a top-level object type with a `Statistics.reference_count` field implying +its instances are countable across the document tree, but no `Page.children` or +`Section.children` union in Version 1.0 included `Reference` as a member — leaving +`Reference` objects with no defined place in the structural tree. This is a minimal, +scoped correction, not a re-opening of the schema design: + +- `Reference` is added as a valid member of `Section.children` (Section 9). It is + **not** added to `Page.children` (Section 8) — references are always governed by a + "References" heading, structurally identical to any other body section, so this is + the same containment path every other Section-only-bound object would take, not a + new containment rule. +- `NodeKind.REFERENCE` is added so `Reference` can participate in the same + discriminated-union mechanism every other `Section.children` member already uses. +- `Reference` (Section 16) gains a required `kind` discriminator field, following the + exact pattern already used by every other discriminated-union member (`Section`, + `Paragraph`, `Table`, `Figure`, `Equation`, `Footnote`). + +No other field, object, relationship, or invariant defined in Version 1.0 is altered. 
+Version 1.0 remains the historical record of what was originally approved; Version 1.1 +is the current implementation contract. + +--- + +## 0. Relationship to the Raw Marker Model + +The Raw Marker Model is a lossless, uninterpreted mirror of Marker's JSON output: a +single `MarkerBlock` envelope type, `extra="allow"`, no discriminated union, no +semantic interpretation. The Document Object described here is the next layer down +the pipeline: it is produced *from* the Raw Marker Model by the Normalizer (not yet +implemented) and is the first place where **structural** interpretation occurs — +deciding what counts as a Table, a Section, a Footnote's likely attachment — while +still containing zero scientific meaning. + +Every Document-layer object therefore exists in addition to, not instead of, the Raw +Marker Model. The Raw Marker Model remains the permanent ground truth on disk; +the Document Object is a derived, queryable, typed structural view over it. This +specification assumes the Raw Marker Model is available as an immutable input and +focuses entirely on what the Normalizer must produce from it. + +--- + +## 1. Architectural Invariants + +These rules are not per-object — they govern the entire Document layer and every +object defined below conforms to them without restating them per-section. + +**1.1 Immutability.** Every Document-layer model is frozen after construction +(Pydantic v2 `model_config = ConfigDict(frozen=True)`). No object is mutated after +the Normalizer finishes building it. Corrections, re-interpretation, or review +happen in later layers (IR, Scientist Review) and never write back into the +Document Object. + +**1.2 Structural-only content.** No Document-layer object may contain a field whose +purpose is to record scientific meaning. Concretely: no `treatment`, `species`, +`observation`, `management_event`, `variable`, or `trait` field exists anywhere in +this schema, even as an optional placeholder. 
If a future need arises to record such +a concept, it belongs in the IR, which is built on top of — never inside — the +Document Object. + +**1.3 Deterministic identifiers.** No identifier in this schema is a UUID4 or any +other non-deterministic value. Every identifier is a pure function of stable inputs, +so that re-running the same PDF through the same Marker version and the same +Normalizer version always yields byte-identical identifiers. The exact construction +rule is given in Section 2. + +**1.4 Maximum available provenance.** Every object that originates from one or more +Marker blocks retains a `StructuralProvenance` value (Section 3.3) referencing the +originating Marker block id(s), page number, bounding box, polygon, and reading-order +position. An object is never permitted to "lose" its Marker origin even when the +Normalizer reshapes or merges multiple Marker blocks into one Document object (e.g. +turning a `SectionHeader` + `Text` pair into one normalized `Caption`). + +**1.5 Deterministic, lossless serialization.** Every model in this schema must +support `model_dump()` / `model_dump_json()` and round-trip back through +`model_validate()` without information loss, exactly as already verified for the +Raw Marker Model. Field ordering in dumped JSON is determined by declaration order +in the Pydantic model (not insertion order at runtime) to keep serialized output +byte-stable across runs. + +**1.6 Independence from downstream layers.** Nothing in this schema imports from, +references, or anticipates retrieval, LLM extraction, validation, the IR, or BETYdb +export. The Document Object's public surface is consumed by those layers, but this +schema has zero knowledge of them. + +--- + +## 2. 
Identifier Strategy + +**Rule.** Every Document-layer object's `id` is computed as: + +``` +id = "doc:" + sha256( document_id + "|" + canonical_path )[:16] +``` + +where `document_id` is the parent Document's own id (Section 4), and +`canonical_path` is a deterministic structural path string specific to each object +type, defined per-object below (generally derived from the originating Marker +block's own path-like id, e.g. `/page/7/Table/2`, when one exists 1:1; or, for objects +synthesized from multiple Marker blocks or with no direct Marker counterpart — such +as a parsed `TableRow` — a path built from the parent object's id plus an ordinal +position among deterministically-ordered siblings, e.g. `.../Table/2/row/3`). + +**Why a hash rather than reusing Marker's path id directly.** Marker's own ids +(`/page/7/Table/2`) are positional/index-based — `Table/2` means "third +Table-typed block encountered in that page's traversal." If a future Marker version +changes internal traversal order, encounters a new block type, or reorders block +discovery, these indices could silently shift between runs on an unchanged PDF, +producing different "stable" ids for the same content. Hashing a path that is +itself still derived from Marker's structure, combined with the document id, +preserves determinism for a fixed Marker/adapter version while making the contract +explicit: **stability is guaranteed within one Marker version, not promised across +Marker upgrades.** The original Marker path is never discarded — it is preserved +verbatim inside every object's `StructuralProvenance.marker_block_ids` — so a Marker +version bump that changes traversal order is detectable (ids change) and +diagnosable (provenance still shows the old vs. new Marker ids).
+ +**document_id construction.** `document_id = "betydoc:" + sha256(source_pdf_identifier)[:16]`, +where `source_pdf_identifier` is a stable external identifier for the source PDF +(DOI if known, else a content hash of the source PDF bytes). This deliberately +excludes Marker version and timestamp from the identity computation: the same PDF +must always resolve to the same `document_id` so that re-processing (e.g. after a +Normalizer bug fix) updates the same logical Document Object rather than minting an +unrelated one. Marker version and processing time are recorded as **metadata about +the materialization**, not folded into identity — see `ProcessingMetadata` (Section +6). + +**Properties guaranteed by this scheme:** +- Same PDF + same Marker version + same Normalizer version ⇒ identical ids + throughout the tree. +- Ids are opaque strings, safe to use as dictionary keys, filenames, or database + foreign keys. +- Every id is traceable backward to a concrete Marker block via + `StructuralProvenance`, satisfying invariant 1.4. + +--- + +## 3. Foundational Supporting Types + +These are not top-level entities; they are embedded value objects used throughout +the schema. + +### 3.1 BoundingBox + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `x0` | float | yes | Marker-observed | Left edge | +| `y0` | float | yes | Marker-observed | Top edge | +| `x1` | float | yes | Marker-observed | Right edge | +| `y1` | float | yes | Marker-observed | Bottom edge | + +Directly carried over from Marker's `bbox` (already typed as `MarkerBBox` in the Raw +Marker Model). Retained at the Document layer because footnote-to-table attachment, +evidence highlighting in the Scientist Review UI, and any future geometric +reconstruction (e.g. merged-cell heuristics) all require it. 
**Invariant:** `x1 >= +x0` and `y1 >= y0`; the Normalizer is responsible for not constructing a violating +instance, but the model also validates this on construction since the cost of +allowing silently-inverted boxes downstream is high (evidence UI would render +boxes wrong with no error signal). + +### 3.2 Polygon + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `points` | list of 4 `(float, float)` pairs | yes | Marker-observed | Carried over from Marker's `polygon` | + +Retained even though `BoundingBox` is derivable from it, because Marker provides +both independently and the polygon can in principle capture skew that an +axis-aligned bbox cannot. This is a direct empirical carry-over (already present and +typed in the Raw Marker Model) rather than a new design — Document-layer objects +simply forward it unchanged. No Document-layer object computes one from the other; +both come from Marker as-is. + +### 3.3 StructuralProvenance + +This is the single most important supporting type in the schema — it is what +satisfies invariant 1.4 for every object below. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `marker_block_ids` | list of str | yes (min length 1) | Marker-observed | The originating Marker block id(s), e.g. `["/page/7/Table/2"]`. A list, not a single value, because some Document objects (e.g. a normalized `Caption` under Pattern B) are synthesized from more than one Marker block. | +| `page_number` | int | yes | Marker-observed | The PDF page this object originates from. For objects spanning conceptually across the synthesis of multiple Marker blocks, this is the page of the primary/first contributing block. | +| `bbox` | `BoundingBox` | no | Marker-observed | Present for any object with a single, well-defined originating region. Absent for objects synthesized from multiple non-adjacent blocks where a single bbox would be misleading (e.g. 
a Pattern-B caption combining a `SectionHeader` far above a trailing `Text` note) — in that case `contributing_bboxes` is populated instead. | +| `contributing_bboxes` | list of `BoundingBox` | no | Marker-observed | Used instead of `bbox` when more than one Marker block contributes geometry, preserving each one rather than collapsing them into a single misleading box. | +| `polygon` | `Polygon` | no | Marker-observed | Mirrors `bbox`'s optionality logic. | +| `reading_order_index` | int | yes | Architectural requirement | The object's position in the document's global linear reading order (Section 3.4). Required on every provenance instance because every structural object has a place in reading order even if its bbox is ambiguous. | +| `section_path` | list of str | yes (may be empty) | Marker-observed (derived) | The chain of governing `SectionHeader` Marker-block ids from Marker's own `section_hierarchy` map, ordered outermost to innermost. Empty only for objects outside any section (e.g. a journal wrapper page's `Picture`). | + +**Why a list of Marker block ids rather than exactly one.** Empirically, not every +Document-layer concept maps 1:1 to a Marker block. The clearest case is `Table` +captions: under Pattern A (`TableGroup`), the caption is one `Caption` block; under +Pattern B (bare `Table`), the equivalent information is split across a +`SectionHeader` block and a `Text` block, sometimes with a trailing "Note:" `Text` +block. Forcing a single-id provenance field would require silently picking one +contributing block and losing the others. A list preserves all of them, satisfying +invariant 1.4 even when normalization merges several Marker blocks into one +Document concept.
+ +### 3.4 Reading Order + +Reading order is **not** a field on a supporting type — it is a global integer +sequence assigned by the Normalizer to every leaf and container object during +construction, equal to that object's position in a single depth-first traversal of +the final Document Object tree, in `children` array order. + +This decision is made explicitly here because it was a confirmed empirical finding, +not a default assumption: Marker's own block id local-index numbers (e.g. the +trailing `4` in `/page/7/Footnote/4`) are **not** monotonic with true reading order — +`Footnote/4` and `Footnote/5` physically appear, in the actual children array, after +`Table/8`, despite having lower index numbers. Reading order must therefore be +(re)computed by the Normalizer from final tree position, never inferred from Marker's +id numbering. `StructuralProvenance.reading_order_index` is this recomputed value, +not a copy of any number embedded in a Marker id string. + +### 3.5 Section Path + +`StructuralProvenance.section_path` is populated directly from Marker's own +`section_hierarchy` dict, which the empirical findings confirmed is already a +precomputed breadcrumb (e.g. a deeply nested `TableCell` carrying +`{'1': '/page/1/SectionHeader/1', '4': '/page/7/SectionHeader/0'}`). Two properties +of this dict are carried forward into the spec rather than assumed away: + +- **Depth keys are not contiguous small integers.** The observed keys were `'1'` + and `'4'`, not `'1'` and `'2'`, indicating these correspond to some absolute + nesting depth from Marker's internal traversal rather than a clean rank. 
The + Document schema therefore stores `section_path` as an **ordered list of + SectionHeader Marker-block ids** (sorted by the numeric value of their original + dict key) rather than preserving Marker's dict-with-gaps shape — this gives + downstream consumers (Retrieval layer, Section nesting) a clean, ordinary list + without forcing them to understand Marker's internal depth-key semantics. +- **The mapping is per-block, not per-Section-object.** Every Marker block — + including deeply nested ones like a `TableCell` — carries its own full path. The + Document Object's `Section` containment hierarchy (Section 9) is derived from this + same data, so `section_path` on any object and that object's ancestor `Section` + chain are guaranteed consistent by construction, not by a separate invariant check. + +--- + +## 4. Document + +**Purpose.** The root container for one processed paper. Holds the page sequence, +top-level metadata, processing metadata, and aggregate statistics. Exactly one +`Document` exists per source PDF per Normalizer run. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `id` | str | yes | Architectural requirement | `document_id`, Section 2. | +| `source_pdf_identifier` | str | yes | Architectural requirement | The stable external identifier used to compute `id` (DOI or content hash). Stored explicitly so the id's derivation is independently checkable, not just trusted. | +| `metadata` | `Metadata` | yes | Marker-observed + Architectural | Section 5. | +| `processing_metadata` | `ProcessingMetadata` | yes | Architectural requirement | Section 6. | +| `statistics` | `Statistics` | yes | Architectural requirement | Section 7. | +| `pages` | list of `Page` | yes (min length 1) | Marker-observed | Ordered by page number ascending; this ordering is also the top level of global reading order. 
| + +**Invariants.** +- `pages` is non-empty and sorted ascending by `Page.page_number` with no + duplicate or skipped page numbers other than what Marker itself reported (a + Marker-side page omission is preserved, not silently re-numbered). +- `Document` is the only object in this schema with no `StructuralProvenance` of + its own (there is no single Marker block representing "the whole document" — the + Raw Marker Model's root node was empirically confirmed to have no `id`, `bbox`, + or `polygon` at all). Its provenance is implicitly "the entire Raw Marker Model + file," which `processing_metadata.source_marker_artifact_ref` captures (Section + 6) rather than a `StructuralProvenance` instance. + +--- + +## 5. Metadata + +**Purpose.** Bibliographic and identification facts about the paper, to the extent +they are structurally recoverable (not semantically extracted — see the boundary +note below). + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `title` | Optional[str] | no | Marker-observed | Taken verbatim from the first/top-level `SectionHeader` or title-styled block on the front matter page, if structurally identifiable. | +| `page_count` | int | yes | Marker-observed | Count of `Page` objects; redundant with `len(pages)` but kept as an explicit field since `Statistics` (Section 7) is meant to hold *derived counts*, while this one is a basic identifying fact worth surfacing without traversing the tree. | +| `has_front_matter_page` | bool | yes | Marker-observed (heuristic) | Whether page 0 (or any page) was structurally flagged as publisher wrapper content. See `Page.is_front_matter` (Section 8) for the per-page flag this aggregates. | + +**Boundary note.** `Metadata` deliberately does **not** include authors, journal +name, publication year, or DOI as structured fields, even though these are +intuitively "metadata." 
Per the empirical findings (3.8), front-matter and +citation-bearing content is **structurally indistinguishable** from other text at +the block-type level — recovering "the authors" or "the journal" requires reading +and interpreting text content, which is scientific/semantic extraction, not +structural parsing. That work belongs to the IR's `Citation` entity (already +specified in the project's broader IR design), built by the extraction layer. This +spec only exposes what is mechanically true of the page structure (title block +location, page count, front-matter flag) — adding speculative `author`/`doi`/`year` +fields here would violate invariant 1.2 and the "no speculative fields" instruction, +since populating them correctly is not a structural operation. + +--- + +## 6. ProcessingMetadata + +**Purpose.** Records *how* this particular Document Object was produced, separate +from *what* it identifies (Section 4's `id`/`source_pdf_identifier`). This is what +makes reproducibility checkable: two Document Objects with the same `id` but +different `ProcessingMetadata` indicate the same paper was processed by a different +Marker or Normalizer version, which is exactly the signal needed to detect drift +without conflating it with document identity. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `marker_version` | str | yes | Marker-observed | Verbatim from Marker's own output metadata, if present; otherwise the version string of the Marker invocation recorded by the adapter. | +| `normalizer_version` | str | yes | Architectural requirement | Semantic version of the Normalizer code that produced this Document Object. Required so a future schema/logic change is always attributable. | +| `processed_at` | datetime (ISO 8601, UTC) | yes | Architectural requirement | Wall-clock time of this materialization. Explicitly **not** part of `id` computation (Section 2) — recorded for audit/debugging only. 
| +| `source_marker_artifact_ref` | str | yes | Architectural requirement | A path or content hash identifying the exact Raw Marker Model JSON file this Document Object was normalized from, satisfying the "Document has no own provenance" note in Section 4 by pointing at the file-level artifact instead of a block-level one. | + +**Why this is architectural rather than Marker-observed for most fields.** Only +`marker_version` comes from Marker itself; the rest exist purely because the +project's stated reproducibility requirement ("identical PDFs ... should always +produce identical Document Objects," and detectability of drift) demands a place to +record the inputs that determine reproducibility, even though Marker's own output +has no opinion on them. + +--- + +## 7. Statistics + +**Purpose.** Aggregate counts over the final Document Object tree, useful for +sanity-checking a Normalizer run (e.g. "did this paper produce zero tables when the +PDF clearly has six") without re-traversing the tree ad hoc. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `page_count` | int | yes | Architectural requirement (derived) | `len(pages)`. | +| `section_count` | int | yes | Architectural requirement (derived) | Total `Section` objects across the document. | +| `paragraph_count` | int | yes | Architectural requirement (derived) | Total `Paragraph` objects. | +| `table_count` | int | yes | Architectural requirement (derived) | Total `Table` objects. | +| `figure_count` | int | yes | Architectural requirement (derived) | Total `Figure` objects. | +| `equation_count` | int | yes | Architectural requirement (derived) | Total `Equation` objects. | +| `footnote_count` | int | yes | Architectural requirement (derived) | Total `Footnote` objects. | +| `reference_count` | int | yes | Architectural requirement (derived) | Total `Reference` objects. 
As of Version 1.1, `Reference` objects are reachable as `Section.children` members under a "References" `Section`, so this is a true traversal count, not a count over an out-of-tree collection. | +| `unresolved_footnote_count` | int | yes | Architectural requirement (derived) | Footnotes whose `attached_object_id` (Section 15) is `None` after Normalizer processing — a direct, queryable signal of how much of the geometric-attachment heuristic (empirical finding 3.2) succeeded on this paper. | + +**Why this object exists at all, given everything in it is derivable.** Every +field here is computable by traversal, so in principle `Statistics` adds no new +information. It exists as an explicit object — rather than leaving consumers to +compute it themselves — because (a) it gives a single, serializable snapshot for +logging/comparison across Normalizer runs without re-parsing the whole tree, and (b) +`unresolved_footnote_count` specifically operationalizes a concern raised directly +in the empirical findings (footnote attachment is a heuristic, not guaranteed) into +a number that can be tracked across the representative paper set as the Normalizer +is built and tuned. + +--- + +## 8. Page + +**Purpose.** One PDF page's structural content, in reading order. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `id` | str | yes | Architectural requirement | Per Section 2; `canonical_path = "/page/{page_number}"`. | +| `page_number` | int | yes | Marker-observed | Zero-indexed, matching Marker's own page numbering. | +| `provenance` | `StructuralProvenance` | yes | Marker-observed | `marker_block_ids = [the Marker Page block's id]`. | +| `children` | list of (`Section` \| `Paragraph` \| `Table` \| `Figure` \| `Equation` \| `Footnote` \| `PageHeader` \| `PageFooter`) | yes (may be empty) | Marker-observed | Top-level content of the page, in final reading order (Section 3.4).
A discriminated union over `block_type`-equivalent kinds, mirroring (but not reusing) Marker's own children-array structure. | +| `is_front_matter` | bool | yes | Marker-observed (heuristic) | True if this page was identified as publisher wrapper content (journal cover, "Submit your article," ISSN-only content, etc.) rather than paper body. | + +**On `is_front_matter`.** Empirical finding 3.8 established that Marker gives no +structural signal distinguishing a wrapper page from a content page — both use +identical block types. This flag is therefore explicitly a **heuristic output of +the Normalizer** (content-pattern based, e.g. presence of "ISSN," "Submit your +article," near-total absence of citation-bearing text), not something copied from +Marker. The field is included now, with its value to be computed later, because the +project's stated requirement is that the schema accommodate this known case without +redesign — per the same logic as the other deferred-population fields in this spec +(Section 22 collects all of them explicitly). + +**On `Reference`'s absence from this union (post Version 1.1).** `Reference` was +added to `Section.children` in Version 1.1 but deliberately not to `Page.children`. +A `Reference` only ever appears under a governing "References" `Section`, never as a +direct child of `Page` with no intervening heading — consistent with how every other +content type in this schema reaches the page only by way of a `Section` once any +heading governs it. Adding `Reference` here as well would imply a second, headingless +containment path that no empirical observation supports. + +**Why a discriminated union for `children` rather than `list[Any]` or one generic +`Block` type.** The Raw Marker Model deliberately uses one uniform envelope because +it must stay agnostic to block semantics. 
The Document Object's job is the opposite: +it exists specifically to make structural type distinctions (a `Table` is not +interchangeable with a `Paragraph` downstream). A discriminated union gives +consumers static type safety and keeps `Page`/`Section` children lists +self-describing in serialized JSON via the discriminator field, with no loss of +ordering since list order is itself the reading-order signal (Section 3.4). + +--- + +## 9. Section + +**Purpose.** A heading-governed grouping of content, derived from Marker's +`section_hierarchy` breadcrumbs (Section 3.5) rather than re-derived from text +pattern-matching on headings. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `id` | str | yes | Architectural requirement | `canonical_path` built from the governing `SectionHeader` Marker block's own path id. | +| `heading_text` | str | yes | Marker-observed | Verbatim text of the governing `SectionHeader` block. | +| `provenance` | `StructuralProvenance` | yes | Marker-observed | `marker_block_ids = [the SectionHeader block's id]`. | +| `depth` | int | yes | Marker-observed (derived) | Position of this section's heading in the ordered `section_path` list (Section 3.5), zero-indexed from the outermost heading on the page/document. | +| `children` | list of (`Section` \| `Paragraph` \| `Table` \| `Figure` \| `Equation` \| `Footnote` \| `Reference`) | yes (may be empty) | Marker-observed | Nested sub-sections and content governed by this heading, in reading order. A `Section` may contain further `Section` objects, giving the hierarchy genuine nesting rather than a flat list with a depth integer alone. `Reference` was added in Version 1.1 (see Changelog) — `ListItem` blocks under a "References" heading appear here as `Reference` children, exactly as any other content type appears under its governing `Section`. 
| + +**Invariant.** Every leaf or container object elsewhere in the schema that carries +a non-empty `section_path` in its `StructuralProvenance` must have a corresponding +ancestor chain of `Section` objects matching that path exactly — this is guaranteed +by construction (both are derived from the same `section_hierarchy` source, per +Section 3.5) rather than checked as a runtime validator, but it is stated here as a +hard design invariant the Normalizer must not violate. + +**Why `SectionHeader` blocks that are really table/figure labels (e.g. a Marker +`SectionHeader` containing only `"Table 3"`, per Pattern B) do not become `Section` +objects.** Empirical finding 3.1 showed Marker uses the same `SectionHeader` +block type both for genuine paper sections (Methods, Results) and for bare-table +caption labels. The Normalizer must distinguish these by context — a +`SectionHeader` immediately followed by a `Text` block and then a `Table`, with no +intervening structural content, is a caption label being consumed into that +`Table`'s `Caption` (Section 11), not a new `Section`. This rule is recorded here so +the schema's `Section` object is understood to represent only genuine paper +sections; the disambiguation logic itself is Normalizer business logic (out of +scope for this document) but the *consequence* — that some `SectionHeader` Marker +blocks become part of a `Caption` rather than a `Section` — is a structural decision +the schema must support, and it does: `Caption.provenance.marker_block_ids` can +include a `SectionHeader` id (Section 11). + +--- + +## 10. Paragraph + +**Purpose.** A single block of body text — the most common leaf content type. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `id` | str | yes | Architectural requirement | Per Section 2. | +| `text` | str | yes | Marker-observed | The block's inline HTML content from Marker, **as-is** (e.g. `<i>`, `<b>` tags preserved).
| +| `provenance` | `StructuralProvenance` | yes | Marker-observed | `marker_block_ids = [the originating Text/ListItem block's id]`. | + +**Why `text` keeps inline HTML rather than being plain-text-stripped.** Empirical +finding (leaf block dump) confirmed Marker leaf blocks carry real semantic inline +markup (`<i>`, `<b>`) directly in their content, not a side annotation. Stripping +it at the Document layer would be a one-way, lossy transformation performed before +any consumer has had a chance to decide whether that markup matters (e.g. a `<b>` +emphasis inside a Methods paragraph could matter to the extraction layer's reasoning +about emphasis on a key term). Per invariant 1.4 (maximum available provenance) and +the general "never lose information without a consumer-side decision to do so" +principle, the Document layer preserves it verbatim; any stripping is a retrieval- +or extraction-layer concern, explicitly out of scope here. + +**Note on `ListItem`/`ListGroup`.** Marker's bibliography uses `ListGroup` containers +of `ListItem` leaves rather than `Text` blocks (this was the basis for separating +references structurally without text pattern-matching). A `ListItem` that is part of +a reference list is **not** modeled as a `Paragraph` — it becomes a `Reference` +(Section 16). A `Paragraph` is reserved for body-text `Text`/generic `ListItem` +content; the Normalizer disambiguates by parent context (a `ListGroup` under the +References section vs. elsewhere), again business logic out of scope here, but the +schema accommodates the distinct outcome via two separate object types. + +--- + +## 11. Caption (supporting type, embedded in Table and Figure) + +**Purpose.** A normalized representation of a table or figure's caption, +collapsing Marker's two empirically observed patterns into one shape. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `label` | Optional[str] | no | Marker-observed | E.g. `"Table 3"` or `"Figure 1"`.
Present whenever a `Caption` block (Pattern A) or a `SectionHeader` label block (Pattern B) was found. | +| `text` | Optional[str] | no | Marker-observed | The descriptive caption sentence. From the `Caption` block's content (Pattern A) or the `Text` block immediately following the label (Pattern B). | +| `trailing_notes` | Optional[str] | no | Marker-observed | The trailing "Note: ..." `Text` block sometimes observed immediately after a `Table`, distinct from both `label`/`text` and from `Footnote` objects (Section 15). Kept as its own field because it was empirically observed to be part of the caption apparatus, not body text, but also not a true Marker `Footnote` block. | +| `provenance` | `StructuralProvenance` | yes | Marker-observed | `marker_block_ids` lists every contributing Marker block (one for Pattern A's single `Caption` block; two or three for Pattern B's `SectionHeader` + `Text` + optional trailing `Text`). Uses `contributing_bboxes` (Section 3.3) rather than a single `bbox` whenever more than one block contributes, since collapsing non-adjacent regions into one bbox would misrepresent the geometry. | + +**Why one normalized shape rather than preserving Marker's two patterns +separately in the schema.** This is the central case the project's "empirically +driven, not speculative" instruction is built around: both patterns were directly +observed (Pattern A on pages 6/10/12, Pattern B on page 7, per finding 3.1), so +normalizing them is not a hypothetical convenience — it is required because every +downstream consumer (extraction layer asking "what is this table about," review UI +displaying "the caption") needs one consistent shape regardless of which pattern the +source PDF happened to produce. Modeling them as two different optional sub-objects +instead would push that disambiguation work onto every consumer rather than once, +inside the Normalizer, where the empirical knowledge of the two patterns actually +lives. + +--- + +## 12.
Table + +**Purpose.** A table's logical structure and its evidence-level cell geometry, +kept as two deliberately parallel representations per the empirical recommendation. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `id` | str | yes | Architectural requirement | Per Section 2. | +| `provenance` | `StructuralProvenance` | yes | Marker-observed | `marker_block_ids = [the Table block's id]` (and the `TableGroup` id too, if Pattern A). | +| `caption` | Optional[`Caption`] | no | Marker-observed | Section 11. `None` only if no caption-bearing blocks were found adjacent to the table at all (not empirically observed in the representative paper, but not excluded as a possibility — captionless tables are not assumed impossible). | +| `raw_html` | str | yes | Marker-observed | The Table block's own `html` field, verbatim — the complete, correctly-nested `<table>...</table>` Marker produces. Treated as the **source of truth for logical structure** (rows, columns, header rows), per the empirical recommendation, precisely because reconstructing structure independently from cell geometry risks disagreeing with Marker's own (already correct) parse. | +| `rows` | list of `TableRow` | yes (may be empty) | Marker-observed (derived) | A structured parse of `raw_html`'s `<tr>` elements into row objects (Section 12.1), giving consumers row/column access without re-parsing HTML themselves. Derived from `raw_html`, not an independent reconstruction. | +| `cells` | list of `TableCell` | yes (may be empty) | Marker-observed | The flat list of Marker `TableCell` child blocks, retained **only** as evidence/geometry data (bbox, polygon, provenance) — explicitly not used to derive row/column structure, per the empirical recommendation that `raw_html` is structural truth and `TableCell` geometry is supplementary. | +| `footnote_ids` | list of str | yes (may be empty) | Architectural requirement (deferred population) | Ids of `Footnote` objects geometrically attached to this table (Section 15). Empty until the Normalizer's bbox-proximity heuristic (finding 3.2) runs; the field exists now so that heuristic's output has a defined home without later schema change. | + +### 12.1 TableRow (supporting type, embedded in `Table.rows`) + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `cells` | list of `TableRowCell` | yes | Marker-observed (derived) | Ordered left to right per the source `<tr>`.
| + +### 12.2 TableRowCell (supporting type, embedded in `TableRow.cells`) + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `text` | str | yes | Marker-observed | Cell text content from the parsed `<td>`/`<th>` element, with any `<math>` wrapper tag stripped and its content treated as equivalent plain text (per empirical finding 3.5 — Marker inconsistently wraps numerically identical `mean ± stderr` values in `<math>` depending on OCR path; the schema does not preserve this distinction since it carries no structural meaning, only an OCR-routing artifact). | +| `is_header` | bool | yes | Marker-observed | True if the source element was `<th>`, false for `<td>`. | +| `structural_notes` | Optional[str] | no | Architectural requirement (deferred) | A free-text slot reserved for a Normalizer-attached structural annotation — most notably, a suspected merged-cell placeholder (empirical finding 3.4: Marker silently flattens merged header cells into duplicated rows with an empty filler cell, with no flag distinguishing this from a genuinely empty cell). The heuristic for populating this field is explicitly **not** decided in this specification — finding 3.4 was flagged as needing more representative papers before a reconstruction rule is chosen. The field is included as an open slot precisely so that decision can be made later without a schema change, consistent with the brief's instruction to accommodate known structural cases without redesign. | + +**Why `TableRowCell` does not have `row_index`/`col_index` integers.** These are +implicit in `Table.rows`' list-of-lists structure itself (a cell's row is its +containing `TableRow`'s position in `rows`; its column is its own position in +`cells`), so adding redundant integer fields would duplicate information already +present in list order, with no Marker-observed justification for storing it twice.
+ +**Why `TableRowCell` has no `rowspan`/`colspan` field.** Empirical finding 3.4 +confirmed Marker's HTML output never emits `rowspan`/`colspan` attributes even where +the source PDF visually has merged cells — it flattens instead. Adding a +`rowspan`/`colspan` field for a case never observed in Marker's actual output would +violate the "no speculative fields" instruction. If a future representative paper +demonstrates Marker does emit span attributes under some condition, this is the +single place such fields would be added. + +### 12.3 TableCell (supporting type, embedded in `Table.cells`; evidence-only) + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `id` | str | yes | Architectural requirement | Per Section 2; path derived from the Marker `TableCell` block's own id. | +| `text` | str | yes | Marker-observed | Verbatim cell content (not math-stripped here — this object is evidence/geometry, not the logical text consumers should read; `TableRowCell.text` is the cleaned version). | +| `bbox` | `BoundingBox` | yes | Marker-observed | Per-cell geometry, the entire reason this parallel representation is retained (evidence highlighting in the review UI). | +| `polygon` | `Polygon` | yes | Marker-observed | Mirrors `bbox`. | + +**Why this evidence-only `TableCell` and the logical `TableRowCell` are not unified +into one type.** Empirical finding 3.3 established these are genuinely two +different, only partially-corresponding representations Marker provides in +parallel — one (the `<table>` HTML) has correct logical structure but no per-cell +geometry, the other (`TableCell` children) has per-cell geometry but no row/column +index. Forcing them into a single type would require either fabricating row/column +indices on the geometry side (an unverified bbox-clustering reconstruction the +findings explicitly flagged as risky) or discarding per-cell geometry on the logical +side (losing the evidence-highlighting capability entirely).
Keeping them separate, +each true to what Marker actually provides, is the choice that adds no +unverified inference. **Open implementation note, not a schema decision:** +positionally correlating a given `TableRowCell` with its corresponding `TableCell` +(for evidence highlighting of a specific logical cell) is left to the Normalizer to +attempt via parse-order correspondence; this spec does not assert that +correspondence is guaranteed, since it was not empirically verified. + +--- + +## 13. Figure + +**Purpose.** A figure region, with its caption normalized the same way as `Table`. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `id` | str | yes | Architectural requirement | Per Section 2. | +| `provenance` | `StructuralProvenance` | yes | Marker-observed | `marker_block_ids = [the Figure block's id]` (and `FigureGroup` id, if present). | +| `caption` | Optional[`Caption`] | no | Marker-observed | Section 11. Empirically, `FigureGroup` always pairs `[Figure, Caption]` in that order (mirroring `TableGroup`'s pairing, just with reversed order — confirmed, not assumed, per the findings doc). | +| `image_data` | Optional[bytes] | no | Marker-observed | Base64-decoded raster image content from Marker's `images` field, when present. Empirically, in the representative paper, only `Picture` blocks (journal logo, cover thumbnail) carried non-empty `images`; the one `Figure` block had `images: {}`. This field is therefore included (figures plausibly can carry raster data, and the project must not assume they never will) but its emptiness in the current evidence base is recorded explicitly in Section 22 as unconfirmed, not silently assumed resolved. | + +--- + +## 14. Equation + +**Purpose.** A mathematical expression block. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `id` | str | yes | Architectural requirement | Per Section 2. 
| +| `provenance` | `StructuralProvenance` | yes | Marker-observed | `marker_block_ids = [the Equation block's id]`. | +| `raw_math` | str | yes | Marker-observed | The verbatim MathML-ish `<math>` content, including any equation number embedded inline (e.g. `"DP = I - IR + P - ETc \pm VR, \qquad (1)"`), per empirical finding 3.7. | +| `equation_number` | Optional[str] | no | Architectural requirement (deferred) | A slot for the parsed-out equation number (e.g. `"1"`), since finding 3.7 confirmed Marker provides no separate field for it — any cross-reference resolution ("using equation (1)" in body text) requires parsing it out of `raw_math`. The parsing logic itself is out of scope for this spec; the field exists so its result has a defined home. | + +--- + +## 15. Footnote + +**Purpose.** A footnote block, with its attachment to a table or figure resolved +geometrically rather than structurally, per empirical finding 3.2. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `id` | str | yes | Architectural requirement | Per Section 2. | +| `provenance` | `StructuralProvenance` | yes | Marker-observed | `marker_block_ids = [the Footnote block's id]`. Note: this block's own provenance never implies attachment — footnotes are flat page-level siblings, not children of any Table/Figure (finding 3.2), so attachment is recorded separately below. | +| `raw_text` | str | yes | Marker-observed | Verbatim footnote content. | +| `attached_object_id` | Optional[str] | no | Architectural requirement (deferred) | The id of the `Table` or `Figure` this footnote was determined to belong to, via the Normalizer's bbox-proximity heuristic ("nearest preceding Table/Figure on the same page by bbox y-position," per finding 3.2). `None` when unresolved — tracked in aggregate by `Statistics.unresolved_footnote_count` (Section 7).
The heuristic itself is Normalizer logic, out of scope here; the field exists so its output, including the legitimate possibility of non-resolution, has a defined, queryable home. | + +**Why attachment is nullable rather than required.** Forcing every footnote to +resolve to a table/figure would hide genuine ambiguity (e.g. a footnote whose +geometric position is equidistant between two candidates, or a page-level +disclaimer footnote unrelated to any table) behind an incorrect best-guess. Per the +project's broader principle (already established for the IR: "fields that cannot be +resolved are marked with an unresolved status rather than silently filled"), the +same discipline applies at the structural layer: `None` is a legitimate, recorded +outcome, not an implementation gap to paper over. + +--- + +## 16. Reference (Bibliography Entry) + +**Purpose.** One bibliography entry, structurally distinguished from body text +because Marker represents references via `ListGroup`/`ListItem`, not `Text` blocks +(observed directly in the page_stats/structure walkthrough, not inferred from +content). + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `kind` | `Literal[NodeKind.REFERENCE]` | yes | Architectural requirement | Added in Version 1.1 (see Changelog). Discriminator for the `Section.children` union, following the same pattern as every other discriminated-union member. | +| `id` | str | yes | Architectural requirement | Per Section 2. | +| `provenance` | `StructuralProvenance` | yes | Marker-observed | `marker_block_ids = [the ListItem block's id]`. | +| `raw_text` | str | yes | Marker-observed | Verbatim reference entry text, including any inline markup Marker preserved. | + +**Boundary note.** Like `Metadata` (Section 5), `Reference` deliberately stops at +verbatim text. Parsing a reference string into author/year/journal/DOI fields is +citation-matching — a semantic operation belonging to the IR's `Citation` entity, +not this layer. 
This object's only job is to say "this `ListItem`, structurally, +is a bibliography entry, not body text," which is information Marker's block typing +already gives for free via the `ListGroup` container. + +--- + +## 17. PageHeader / PageFooter + +**Purpose.** Repeated journal running-header/footer content (e.g. the journal name +repeated on every page, or page-footer branding), retained for completeness and +front-matter heuristics (Section 8) but not expected to be consumed by extraction. + +| Field | Type | Required | Origin | Notes | +|---|---|---|---|---| +| `id` | str | yes | Architectural requirement | Per Section 2. | +| `provenance` | `StructuralProvenance` | yes | Marker-observed | `marker_block_ids = [the PageHeader/PageFooter block's id]`. | +| `raw_text` | str | yes | Marker-observed | Verbatim content. | + +Modeled as two distinct types (`PageHeader`, `PageFooter`) rather than one generic +"running content" type, simply mirroring Marker's own distinct block types +one-to-one — there is no structural reason to merge them, and merging would lose +the type distinction Marker itself already makes. + +--- + +## 18. Cross-Object Relationships & Invariants Summary + +This section consolidates relationship rules stated piecemeal above, for a single +point of reference. + +- **Containment is exclusively via `children` lists and explicit id-reference + fields (`footnote_ids`, `attached_object_id`) — never via implicit ordering + conventions or id-string parsing.** Any consumer needing "what footnotes belong + to this table" reads `Table.footnote_ids`, never re-derives it from geometry + itself; the Normalizer computes that relationship exactly once. +- **Reading order is a single global property (Section 3.4), independent of any + per-object containment.** Two sibling objects under different `Section`s can be + compared for relative reading order via their `reading_order_index` without + needing to know anything about section nesting. 
+- **Every non-`Document` object carries exactly one `StructuralProvenance`,** which + is the only place Marker block ids appear outside of `ProcessingMetadata`'s + artifact reference. No object duplicates Marker ids elsewhere in its own fields. +- **No object type defined in this specification has a field referencing an IR, + retrieval, validation, or export concept**, satisfying invariant 1.6 by + construction — this is checked by inspection of this document, not by a runtime + rule, since it is a closed schema with a fixed object list. + +--- + +## 19. Serialization Requirements + +- All models use `model_config = ConfigDict(frozen=True, extra="forbid")`. Unlike + the Raw Marker Model (which intentionally used `extra="allow"` for forward + compatibility with unknown future Marker fields), the Document Object is the project's own + designed contract — an unexpected extra field here indicates a Normalizer bug, + not a benign future Marker addition, so it should fail loudly (`extra="forbid"`) + rather than silently passing through. +- `model_dump_json()` must be deterministic for a given object graph: field order + follows declaration order (Pydantic v2 default), list order follows the + semantically meaningful order already specified per field (reading order for + children, left-to-right for table cells, outermost-to-innermost for + `section_path`) — never a non-deterministic order like dict-hash order. + `datetime` fields serialize as ISO 8601 strings in UTC. +- Every model must round-trip losslessly through `model_dump()` → + `model_validate()` and `model_dump_json()` → `model_validate_json()`, mirroring + the test discipline already established and passing for the Raw Marker Model. +- Bytes fields (`Figure.image_data`) serialize as base64 strings in JSON, matching + Marker's own convention for `images`, so no separate encoding scheme is + introduced at this layer. + +--- + +## 20. 
Validation Rules + +These are construction-time invariants enforced by each model's own validators, +distinct from the cross-cutting invariants in Section 1 (which are policies the +Normalizer must follow, not all individually mechanically checkable). + +- `BoundingBox`: `x1 >= x0` and `y1 >= y0`. +- `StructuralProvenance`: `marker_block_ids` has at least one element; + `reading_order_index >= 0`; exactly one of `bbox` or `contributing_bboxes` (or + neither, for objects with genuinely no recoverable geometry) is populated — never + both, to avoid two disagreeing geometric claims about the same object. +- `Document`: `pages` non-empty; `page_number` values across `pages` are unique. +- `Page`: `page_number >= 0`. +- `Table`: if `rows` is non-empty, every `TableRow.cells` list has at least one + element (a row with zero cells is not a meaningful row — such input indicates a + parse error in `raw_html`, which should surface as a Normalizer-time error, not a + silently-accepted empty row in the Document Object). +- `Footnote`: no validation forces `attached_object_id` to be set — its absence is + valid by design (Section 15). +- `Statistics`: every count field is `>= 0`; `unresolved_footnote_count <= + footnote_count` (a basic sanity bound the model itself can check independent of + whatever produced the numbers). + +--- + +## 21. Explicitly Deferred — Not Modeled, By Design + +Per the instruction to avoid speculative fields, the following structural cases +identified during evaluation are **intentionally absent** from this schema rather +than represented with a guessed-at field shape, because the representative paper +set does not yet provide enough evidence to know what shape is correct: + +- **Multi-page table continuation.** Not observed in the representative paper (no + table spans a page break). No `continues_on_page` / `continuation_of_table_id` + field is added speculatively. 
When a representative paper exhibiting this is + evaluated, this section is where such a field would be added — as an addition, + not a redesign, since `Table` already has a stable `id` to reference. +- **Multi-panel figure decomposition.** The representative paper's Figure 3 has 10 + visually lettered sub-panels under one shared caption, but Marker recorded it as + a single flat `Figure` block with no internal panel structure. Since this is the + only data point (n=1) and it shows Marker *not* decomposing panels, no `panels: + list[FigurePanel]` field is added on the strength of a PDF-visual observation that + contradicts what Marker itself outputs. If a future paper shows Marker does + sometimes decompose panels, this is where that field would be introduced. +- **TableGroup-vs-bare-Table triggering condition.** Both patterns are modeled + (via `Caption`'s flexible provenance, Section 11), but *why* Marker chooses one + over the other (single table per region vs. dense multi-table page, per the one + data point available) is not encoded as a schema concept — it doesn't need to be, + since the Document Object normalizes both outcomes into the same `Caption` shape + regardless of cause. +- **Merged-cell reconstruction heuristic.** The *slot* (`TableRowCell.structural_ + notes`) exists (Section 12.2), but the specific rule for populating it (e.g. + "empty cell directly below a filled cell in the same column ⇒ merged-placeholder + suspected") is explicitly not decided here, per finding 3.4's own conclusion that + this needs more representative papers first. + +--- + +## 22. Summary — Field Origin Distribution + +A consolidated view of the distinction requested for this specification: how many +fields per object are direct Marker carry-overs versus existing purely to satisfy +an architectural requirement (provenance, determinism, reproducibility, +serialization) versus reserved as a deferred-population slot for a Normalizer +heuristic not yet designed. 
+ +| Object | Marker-observed fields | Architectural-requirement fields | Deferred-population slots | +|---|---|---|---| +| Document | `pages` | `id`, `source_pdf_identifier`, `metadata`, `processing_metadata`, `statistics` | — | +| Metadata | `title`, `page_count`, `has_front_matter_page` | — | — | +| ProcessingMetadata | `marker_version` | `normalizer_version`, `processed_at`, `source_marker_artifact_ref` | — | +| Statistics | — | all count fields | — | +| Page | `page_number`, `children`, `is_front_matter` (heuristic) | `id`, `provenance` | — | +| Section | `heading_text`, `depth`, `children` | `id`, `provenance` | — | +| Paragraph | `text` | `id`, `provenance` | — | +| Caption | `label`, `text`, `trailing_notes` | `provenance` | — | +| Table | `raw_html`, `rows`, `cells`, `caption` | `id`, `provenance` | `footnote_ids` | +| TableRowCell | `text`, `is_header` | — | `structural_notes` | +| TableCell | `text`, `bbox`, `polygon` | `id` | — | +| Figure | `caption`, `image_data` | `id`, `provenance` | — | +| Equation | `raw_math` | `id`, `provenance` | `equation_number` | +| Footnote | `raw_text` | `id`, `provenance` | `attached_object_id` | +| Reference | `raw_text` | `id`, `provenance` | — | +| PageHeader/PageFooter | `raw_text` | `id`, `provenance` | — | + +--- + +## 23. Exit Criteria for This Specification + +This specification is ready to be frozen and handed to implementation once: + +1. Every object above has a 1:1 or many:1 mapping back to either an observed + Marker block type or a named architectural requirement (satisfied throughout + this document via the "Origin" column on every field table). +2. 
No field exists whose justification is "might be useful later" rather than + "Marker provides this" or "the architecture requires this for provenance / + determinism / reproducibility / serialization / validation" (satisfied; the one + category that looks speculative — deferred-population slots — is explicitly + justified by the stated requirement to avoid future schema redesign, and is + listed exhaustively in Section 22's third column plus Section 21's explicit + exclusions). +3. No object or field encodes scientific meaning (satisfied — verified against + invariant 1.2 by inspection of the full object list in Sections 4–17). +4. Implementing this specification in Pydantic v2 requires translation, not new + design decisions — the identifier rule (Section 2), provenance rule (Section + 3.3), reading-order rule (Section 3.4), and every per-object field table are + concrete enough to type directly. + +Once reviewed and approved, the next phase is the mechanical translation of this +document into immutable Pydantic v2 models, followed by the Normalizer that +populates them from the Raw Marker Model. diff --git a/docs/jetstream_environment.md b/docs/jetstream_environment.md new file mode 100644 index 0000000..e5a21d0 --- /dev/null +++ b/docs/jetstream_environment.md @@ -0,0 +1,20 @@ +Hostname: bety-db-llm-gpu + +CPU: +16 AMD EPYC cores + +RAM: +58 GB + +GPU: +A100 20GB + +CUDA: +12.2 + +Storage: +484 GB local +9.8 TB shared mount (/software) + +Python: +3.12.3 \ No newline at end of file diff --git a/docs/marker_empirical_findings_paper1.md b/docs/marker_empirical_findings_paper1.md new file mode 100644 index 0000000..b2ab3ca --- /dev/null +++ b/docs/marker_empirical_findings_paper1.md @@ -0,0 +1,223 @@ +# Marker Output — Empirical Findings (Paper 1: Nutrient Cycling, Smukler et al. 2012) + +Source: `/mnt/user-data/uploads/1781681908897_Nutrient-cycling.json` +17 pages. Tree rooted at a single `Document` node. + +## 1. 
Universal block envelope + +Every node in the tree, container or leaf, shares the identical field set: + +``` +id : str e.g. "/page/7/Table/2" (path-like, encodes page + type + local index) +block_type : str e.g. "Table", "Text", "Page", "Document" +html : str either real inline HTML content (leaf), or a manifest of + pointers to children (container) +polygon : list 4 [x,y] corner points +bbox : list [x0, y0, x1, y1] +children : list | None nested block objects, or None for true leaves +section_hierarchy : dict {depth_str: section_header_id} — governing heading path +images : dict | None base64 image data keyed by this block's own id + (populated only for Picture blocks in this paper; {} otherwise) +``` + +No block type adds extra fields beyond this envelope. This means the schema's base +`MarkerBlock` type can be a single shape; specialization happens through `block_type` +plus type-specific *interpretation* of `html`/`children`, not through extra fields. + +## 2. Global block-type census (this paper) + +| block_type | count | notes | +|---------------|-------|-------| +| TableCell | 781 | always leaf, always child of a Table | +| Text | 105 | leaf; generic paragraph/caption-fragment/note | +| ListItem | 81 | leaf; reference entries | +| PageFooter | 49 | leaf; repeated journal footer per page | +| Page | 17 | container; one per PDF page | +| SectionHeader | 13 | leaf; includes real section headers AND table/figure labels ("Table 3") | +| Caption | 7 | leaf; only appears inside TableGroup/FigureGroup wrapping | +| Table | 7 | container of TableCells; html also contains a parallel full `
` HTML repr | +| Figure | 6 | leaf (no children); images field empty in this paper (no embedded raster found) | +| ListGroup | 6 | container of ListItems (reference list chunks) | +| Footnote | 5 | leaf; NOT nested under their related Table — flat page-level siblings | +| TableGroup | 3 | container: always exactly [Caption, Table] in that order | +| Picture | 2 | leaf; carries actual base64 raster in `images` (journal logo/cover thumbnail) | +| Document | 1 | root | +| PageHeader | 1 | leaf | +| FigureGroup | 1 | container: always exactly [Figure, Caption] in that order | +| Equation | 1 | leaf; html contains MathML-ish `` with the equation number INLINE in the math string | + +Note: `Span` and `Line`, which appeared in the `page_stats` summary block_counts, +do NOT appear anywhere in the actual tree. They are lower-level OCR primitives that +Marker collapses into the parent block's `html` string and does not expose as tree +nodes. The schema should not plan to consume Span/Line directly. + +## 3. Confirmed structural patterns + +### 3.1 Caption pairing is inconsistent across two different mechanisms + +**Pattern A — wrapped (TableGroup / FigureGroup):** +`TableGroup` and `FigureGroup` are containers whose children are ALWAYS exactly +`[Caption, Table]` or `[Figure, Caption]` respectively (order differs!: caption comes +*before* Table in TableGroup, *after* Figure in FigureGroup). Caption is a single +block of html content. + +**Pattern B — unwrapped (bare Table directly under Page):** +No TableGroup wrapper exists. Instead, caption information is split across TWO +separate sibling blocks immediately preceding the Table: + - a `SectionHeader` containing only the label, e.g. `

<h4>Table 3</h4>

` + - a `Text` block containing the full descriptive caption sentence +A trailing `Text` block ("Note: ...") may also follow the table — this is part of +the caption/footnote apparatus, not body text. + +Confirmed on page 7 (Tables 3 and 4, both unwrapped) vs. pages 6/10/12 (wrapped). +**The Document Schema must normalize both patterns into one canonical Table.caption +field** — the Normalizer needs a rule: "if TableGroup, caption = the Caption child's +text; if bare Table, caption = nearest-preceding SectionHeader text + nearest text +block before the Table, concatenated." + +### 3.2 Footnote-to-Table attachment requires geometric inference + +Footnotes are NEVER nested inside their related Table, and are NOT reliably +ordered/numbered adjacent to it either. On page 7: Table 3 (id .../Table/2), +Table 4 (id .../Table/8), and three Footnotes (ids .../Footnote/4, /5, /10) are +all flat siblings of the Page. + +The only reliable signal for attachment is **bbox y-coordinate ordering**: +Footnote/4 (y: 271–280) and Footnote/5 (y: 283–292) fall between Table 3's +"Note" text (y: 259–268) and Table 4's header (y: 342) → belong to Table 3. +Footnote/10 (y: 663–674) falls after Table 4's Note (y: 652–661) → belongs to Table 4. + +**Required Normalizer rule:** assign each Footnote to the nearest preceding +Table/Figure on the same page by bbox y-position, not by id adjacency or tree +nesting (id adjacency is NOT reliable — see Footnote ids 4,5 vs Table id 2, +and Footnote id 10 vs Table id 8; the numbering interleaves with other blocks). + +### 3.3 Tables carry two parallel, partially redundant representations + +A `Table` block's own `html` field contains a COMPLETE, correctly nested +`
<table><tr><th>...</th><td>...</td></tr>...</table>
` structure +with correct logical row/column grouping. + +Separately, the same Table also has N `TableCell` children, each with its own +bbox/polygon, but **no explicit row index or column index field** — row/column +membership is implicit and would need to be reconstructed by clustering bbox +y-ranges (rows) and x-ranges (columns) if cell-level geometry is needed. + +**Schema decision needed:** does the Document Object's canonical Table representation +parse structure from the `html` (reliable logical structure, no per-cell geometry), +from the `TableCell` children (per-cell geometry, structure must be inferred), or +both (html for logical truth, TableCell bboxes for evidence-highlighting only)? +Recommendation to evaluate against more papers: treat `html` as the source of +truth for logical table structure (rows/cols/headers), and TableCell geometry as +supplementary evidence-location data only, since reconstructing rows/cols from +bbox clustering independently risks disagreeing with Marker's own html parse. + +### 3.4 Merged/spanning cells are silently flattened, not marked + +Table 3 in the source PDF has merged row labels (e.g. "Irrigated Y1" / "(South Field)" +spans two visual rows as one label). Marker's output html does NOT use `rowspan`; +instead it duplicates the row structure and leaves the second row's corresponding +cell empty (``). There is no flag distinguishing "genuinely empty cell" +from "this is a placeholder for a merged cell above." This is invisible information +loss unless the schema explicitly accounts for it. + +**Open question for the Document Schema:** do we attempt to reconstruct rowspans +heuristically (empty cell directly below a filled cell in the same column = merged), +or do we accept Marker's flattening as-is and rely on the original row-group label +(e.g. "Irrigated Y1") being unambiguous from context alone? 
Needs testing against +more papers with merged cells before deciding — flagging as deferred per the +agreed scoping (don't over-design from one example). + +### 3.5 Inline math/value formatting is inconsistent within the same table + +Numerically identical value formats (`mean ± stderr`) appear sometimes as plain +text (`7.9 ± 0.1`) and sometimes wrapped in `7.9 \pm 0.1` within the +SAME table, seemingly depending on which OCR path (`pdftext` vs `surya`) produced +that specific cell. This matches the evaluation's noted concern about superscript/ +subscript/symbol OCR artifacts. + +**Required Normalizer rule:** strip/unwrap `` tags during cell text +extraction and treat their content as equivalent plain text — do not let table +schema or downstream parsing branch on whether a cell happens to contain a +`` wrapper. + +### 3.6 Significance markers are embedded in cell text, not structured + +Asterisks and daggers indicating statistical significance (`**`, `*`, `†`) are +appended directly to the numeric text inside table cells (e.g. `"64.5 ± 16.7**"`), +rather than being a separate, structured annotation. Linking a given cell's +significance marker to its meaning requires (a) parsing trailing marker characters +off the cell text, and (b) resolving them against the page's Footnote blocks +(per 3.2) which define what `*`, `**`, `†` mean for that table. + +**Required IR/Evidence implication:** a Measurement object extracted from such +a cell needs a place to carry a `significance_annotation` (raw marker + resolved +meaning), sourced from combining the cell text parse with the resolved footnote. + +### 3.7 Equation numbering is embedded in the math string, not a separate field + +The single Equation block in this paper has html: +`

<math display="block">DP = I - IR + P - ETc \pm VR, \qquad (1)</math>

` + +The `(1)` equation number is part of the math content string itself. There is no +separate `equation_number` field. Any cross-reference resolution (body text says +"using equation (1)") will need to parse the number out of the math string. + +### 3.8 Front-matter / non-scientific content is structurally indistinguishable + +Page 0 (journal cover/routing page — ISSN, "Submit your article," "Article views: 133", +Taylor & Francis branding) uses the exact same block types (Picture, SectionHeader, +Text, Figure, PageFooter) as genuine content pages. Nothing in block_type or +structure flags this page as non-scientific front matter — Marker has no concept +of "this entire page is publisher wrapper, not part of the paper." This must be +detected, if needed, by content heuristics (presence of "ISSN", "Submit your +article", DOI-only content, etc.) or simply accepted into the Document Object +and filtered downstream during scientific extraction (retrieval layer would +simply never retrieve it because it's not relevant to any scientific query). + +### 3.9 `section_hierarchy` gives a live heading path per block + +Every block carries a `section_hierarchy` dict mapping a depth-index string to +the id of the governing SectionHeader at that depth — e.g. a TableCell deep +inside Table 3 carries `{'1': '/page/1/SectionHeader/1', '4': '/page/7/SectionHeader/0'}`, +meaning "under top-level heading from page 1 (likely 'Materials and Methods'), +under nearer heading from page 7 ('Table 3')." This is effectively a precomputed +breadcrumb and is very useful — the Document Object's Section nesting can likely +be derived directly from this rather than re-deriving it from reading order. + +## 4. Implications carried forward to Document Schema Specification + +1. Base `MarkerBlock` envelope is uniform — confirms a single ingestion parser + can handle all block types polymorphically by switching on `block_type`. +2. 
Table.caption must be normalized across two different Marker patterns (3.1). +3. Footnote attachment must be resolved by geometric proximity, not tree + structure or id adjacency (3.2) — Normalizer needs explicit bbox-based rule. +4. Table logical structure should likely be sourced from `html`, not reconstructed + from TableCell bboxes (3.3) — pending confirmation against more papers. +5. Merged-cell handling is an open/deferred question, not yet resolved (3.4). +6. `` wrapper inconsistency must be normalized away at ingestion (3.5). +7. Significance markers need a dedicated annotation slot in the IR, sourced from + combined cell-text-parsing + footnote-resolution (3.6). +8. Equation cross-references require number extraction from math string (3.7). +9. Front-matter detection is a content-heuristic problem, not structurally free (3.8). +10. `section_hierarchy` likely gives us Section nesting almost for free (3.9) — + worth designing the Normalizer to lean on this rather than re-deriving nesting. + +## 5. Still unverified / needs a second paper to confirm or refute + +- Is TableGroup-wrapping vs bare-Table-with-SectionHeader purely a function of + PDF layout (single table per region vs dense multi-table page), or something + else? (Page 7 has 2 dense tables back-to-back and got the bare pattern; pages + 6/10/12 have one table each and got TableGroup.) Single-paper evidence only. +- Does Figure ever carry actual `images` data, or was the empty `images: {}` we + saw specific to this paper's figures (which are likely vector/map graphics, + not raster photos)? The only non-empty `images` we found were on the two + Picture blocks (journal logo, cover thumbnail), not on any Figure block. +- Multi-page table continuation: NOT observed in this paper — no table spans + a page break here. Still an open edge case requiring a different example paper. +- Multi-panel figures with one shared caption (e.g. 
Figure 3's panels a–j): the + PDF clearly shows Figure 3 as 10 lettered sub-panels under one caption, but + Marker recorded it as a single `Figure` block (id /page/8/Figure/...) with + no internal panel structure. Need to confirm: does Marker ever decompose + multi-panel figures, or does it always flatten to one Figure block regardless + of internal panel count? This paper suggests always-flatten, but n=1. diff --git a/docs/paper_analysis_template.md b/docs/paper_analysis_template.md new file mode 100644 index 0000000..170b77c --- /dev/null +++ b/docs/paper_analysis_template.md @@ -0,0 +1,21 @@ +Citation + +Site + +Species + +Treatments + +Controls + +Management Events + +Traits/Yields + +Important Tables + +Important Figures + +Ambiguities + +Potential Extraction Challenges \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..d591df2 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,12 @@ +[project] +name = "betydb-extraction" +version = "0.1.0" +requires-python = ">=3.12" +dependencies = ["pydantic>=2.0"] + +[tool.setuptools.packages.find] +where = ["src"] + +[build-system] +requires = ["setuptools>=68"] +build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..9e901cc --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +pandas +numpy +pydantic +pytest +jupyter +python-dotenv \ No newline at end of file diff --git a/src/betydb_extraction/__init__.py b/src/betydb_extraction/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/betydb_extraction/document/__init__.py b/src/betydb_extraction/document/__init__.py new file mode 100644 index 0000000..9f9e1a8 --- /dev/null +++ b/src/betydb_extraction/document/__init__.py @@ -0,0 +1,84 @@ +"""The Document Object: a structural, semantics-free typed view over one +processed paper, produced by the Normalizer from the Raw Marker Model. 
+ +Implements the Document Schema Specification, Version 1.1. See +``docs/document_schema_specification_v1.1.md`` for the full contract; +``docs/document_schema_specification_v1.0.md`` is preserved as the +historical record of the originally approved specification. +""" +from __future__ import annotations + +from betydb_extraction.document.caption import Caption +from betydb_extraction.document.document import Document +from betydb_extraction.document.enums import NodeKind +from betydb_extraction.document.equation import Equation +from betydb_extraction.document.figure import Figure +from betydb_extraction.document.footnote import Footnote +from betydb_extraction.document.identifiers import ( + DOCUMENT_ID_PREFIX, + OBJECT_ID_PREFIX, + compute_document_id, + compute_object_id, + is_valid_document_id, + is_valid_object_id, + validate_document_id_shape, + validate_object_id_shape, +) +from betydb_extraction.document.metadata import Metadata +from betydb_extraction.document.page import Page, PageChild +from betydb_extraction.document.page_furniture import PageFooter, PageHeader +from betydb_extraction.document.paragraph import Paragraph +from betydb_extraction.document.processing_metadata import ProcessingMetadata +from betydb_extraction.document.provenance import ( + BoundingBox, + Polygon, + StructuralProvenance, +) +from betydb_extraction.document.reference import Reference +from betydb_extraction.document.section import Section, SectionChild +from betydb_extraction.document.statistics import Statistics +from betydb_extraction.document.table import Table, TableCell, TableRow, TableRowCell + +__all__ = [ + "BoundingBox", + "Caption", + "DOCUMENT_ID_PREFIX", + "Document", + "Equation", + "Figure", + "Footnote", + "Metadata", + "NodeKind", + "OBJECT_ID_PREFIX", + "Page", + "PageChild", + "PageFooter", + "PageHeader", + "Paragraph", + "Polygon", + "ProcessingMetadata", + "Reference", + "Section", + "SectionChild", + "Statistics", + "StructuralProvenance", + "Table", + 
"TableCell", + "TableRow", + "TableRowCell", + "compute_document_id", + "compute_object_id", + "is_valid_document_id", + "is_valid_object_id", + "validate_document_id_shape", + "validate_object_id_shape", +] + +# Resolve forward references for the recursive Section <-> SectionChild +# union and the Page -> Section dependency, in dependency order. Section +# itself already calls model_rebuild() at the end of section.py; this +# second pass ensures Page's own discriminated union (built after Section +# is already resolved) is consistent when the package is imported as a +# whole rather than module-by-module. +Page.model_rebuild() +Document.model_rebuild() \ No newline at end of file diff --git a/src/betydb_extraction/document/caption.py b/src/betydb_extraction/document/caption.py new file mode 100644 index 0000000..a9cd2e4 --- /dev/null +++ b/src/betydb_extraction/document/caption.py @@ -0,0 +1,75 @@ +"""The Caption supporting type. + +Implements the Document Schema Specification, Section 11 ("Caption +(supporting type, embedded in Table and Figure)"): a normalized +representation of a table or figure's caption, collapsing Marker's two +empirically observed patterns (Pattern A: a dedicated ``Caption`` block; +Pattern B: a ``SectionHeader`` label block followed by a ``Text`` block) +into one shape, so every downstream consumer needs to handle only one +caption shape regardless of which pattern the source PDF produced. + +``Caption`` is not itself a member of any ``Page``/``Section`` children +union -- it is always embedded inside a ``Table`` or ``Figure``, never a +standalone node in the reading-order tree, so it carries no ``kind`` +discriminator. +""" +from __future__ import annotations + +from pydantic import BaseModel, ConfigDict, Field + +from betydb_extraction.document.provenance import StructuralProvenance + +__all__ = ["Caption"] + + +class Caption(BaseModel): + """A normalized table/figure caption, collapsing Marker's two patterns. + + Spec Section 11. 
``label``, ``text``, and ``trailing_notes`` are all + independently optional rather than required-together: per the spec's + own per-field origin notes, each is populated only when the + corresponding Marker block was actually found, and the three are not + guaranteed to co-occur (e.g. Pattern A may have no trailing "Note:..." + block at all). + """ + + model_config = ConfigDict(frozen=True, extra="forbid") + + label: str | None = Field( + default=None, + description=( + "E.g. 'Table 3' or 'Figure 1'. Present whenever a Caption block " + "(Pattern A) or a SectionHeader label block (Pattern B) was " + "found." + ), + ) + text: str | None = Field( + default=None, + description=( + "The descriptive caption sentence. From the Caption block's " + "content (Pattern A) or the Text block immediately following " + "the label (Pattern B)." + ), + ) + trailing_notes: str | None = Field( + default=None, + description=( + "The trailing 'Note: ...' Text block sometimes observed " + "immediately after a Table, distinct from both label/text and " + "from Footnote objects (Section 15). Kept as its own field " + "because it was empirically observed to be part of the caption " + "apparatus, not body text, but also not a true Marker Footnote " + "block." + ), + ) + provenance: StructuralProvenance = Field( + description=( + "marker_block_ids lists every contributing Marker block (one " + "for Pattern A's single Caption block; two or three for " + "Pattern B's SectionHeader + Text + optional trailing Text). " + "Uses contributing_bboxes rather than a single bbox whenever " + "more than one block contributes, since collapsing " + "non-adjacent regions into one bbox would misrepresent the " + "geometry." + ) + ) \ No newline at end of file diff --git a/src/betydb_extraction/document/document.py b/src/betydb_extraction/document/document.py new file mode 100644 index 0000000..dead34d --- /dev/null +++ b/src/betydb_extraction/document/document.py @@ -0,0 +1,76 @@ +"""The Document model. 
+ +Implements the Document Schema Specification, Section 4 ("Document"): the +root container for one processed paper. +""" +from __future__ import annotations + +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator + +from betydb_extraction.document.identifiers import validate_document_id_shape +from betydb_extraction.document.metadata import Metadata +from betydb_extraction.document.page import Page +from betydb_extraction.document.processing_metadata import ProcessingMetadata +from betydb_extraction.document.statistics import Statistics + +__all__ = ["Document"] + + +class Document(BaseModel): + """The root container for one processed paper. + + Spec Section 4. Exactly one ``Document`` exists per source PDF per + Normalizer run. Unlike every other object in this schema, ``Document`` + carries no ``StructuralProvenance`` of its own -- the Raw Marker + Model's root node was empirically confirmed to have no ``id``, + ``bbox``, or ``polygon`` at all, so there is no single Marker block + representing "the whole document." Its provenance is implicitly "the + entire Raw Marker Model file," captured by + ``processing_metadata.source_marker_artifact_ref`` instead. + """ + + model_config = ConfigDict(frozen=True, extra="forbid") + + id: str = Field(description="document_id, per Spec Section 2.") + source_pdf_identifier: str = Field( + description=( + "The stable external identifier used to compute id (DOI or " + "content hash). Stored explicitly so the id's derivation is " + "independently checkable, not just trusted." + ) + ) + metadata: Metadata = Field(description="Spec Section 5.") + processing_metadata: ProcessingMetadata = Field(description="Spec Section 6.") + statistics: Statistics = Field(description="Spec Section 7.") + pages: list[Page] = Field( + min_length=1, + description=( + "Ordered by page number ascending; this ordering is also the " + "top level of global reading order." 
+ ), + ) + + @field_validator("id") + @classmethod + def _check_id_shape(cls, value: str) -> str: + return validate_document_id_shape(value) + + @model_validator(mode="after") + def _check_pages_sorted_and_unique(self) -> "Document": + page_numbers = [page.page_number for page in self.pages] + + if len(page_numbers) != len(set(page_numbers)): + raise ValueError( + "Document.pages contains duplicate page_number values; " + "each page must have a unique page_number." + ) + + if page_numbers != sorted(page_numbers): + raise ValueError( + "Document.pages must be ordered ascending by page_number. " + "A Marker-side page omission (a gap in the sequence) is " + "permitted and preserved, not silently re-numbered -- but " + "the list order itself must still be ascending." + ) + + return self \ No newline at end of file diff --git a/src/betydb_extraction/document/enums.py b/src/betydb_extraction/document/enums.py new file mode 100644 index 0000000..5fa48a5 --- /dev/null +++ b/src/betydb_extraction/document/enums.py @@ -0,0 +1,45 @@ +"""Controlled-value enumerations used by the Document layer. + +The Document Schema Specification (Section 8) describes ``Page.children`` +(and, by the same logic, ``Section.children``) as "a discriminated union +... mirroring ... Marker's own children-array structure" and notes that +consumers should be able to identify each child's kind "via the +discriminator field" in serialized JSON. + +The specification names the *concept* of a discriminator field but does +not name a concrete field or its literal values, since that is a detail +of how the union is actually implemented in code rather than a structural +property of any one object. ``NodeKind`` is that concrete realization: a +closed, controlled set of values, one per concrete child type that can +appear in ``Page.children`` or ``Section.children``. 
This is a purely +technical necessity for building a working discriminated union in +Pydantic v2 -- it introduces no new structural concept beyond what +Section 8 already calls for. +""" + +from __future__ import annotations + +from enum import Enum + +__all__ = ["NodeKind"] + + +class NodeKind(str, Enum): + """Discriminator values for members of a Page's or Section's children. + + Each concrete Document-layer model that can appear inside + ``Page.children`` or ``Section.children`` carries a literal ``kind`` + field set to exactly one of these values, enabling Pydantic to + resolve the discriminated union without ambiguity and giving + serialized JSON a self-describing type tag. + """ + + SECTION = "section" + PARAGRAPH = "paragraph" + TABLE = "table" + FIGURE = "figure" + EQUATION = "equation" + FOOTNOTE = "footnote" + PAGE_HEADER = "page_header" + PAGE_FOOTER = "page_footer" + REFERENCE = "reference" \ No newline at end of file diff --git a/src/betydb_extraction/document/equation.py b/src/betydb_extraction/document/equation.py new file mode 100644 index 0000000..f559874 --- /dev/null +++ b/src/betydb_extraction/document/equation.py @@ -0,0 +1,57 @@ +"""The Equation model. + +Implements the Document Schema Specification, Section 14 ("Equation"). +""" + +from __future__ import annotations + +from typing import Literal + +from pydantic import BaseModel, ConfigDict, Field, field_validator + +from betydb_extraction.document.enums import NodeKind +from betydb_extraction.document.identifiers import validate_object_id_shape +from betydb_extraction.document.provenance import StructuralProvenance + +__all__ = ["Equation"] + + +class Equation(BaseModel): + """A mathematical expression block. + + Spec Section 14. ``raw_math`` preserves Marker's MathML-ish content + verbatim, including any equation number embedded inline within the + math string itself (e.g. ``"... (1)"``) -- Marker provides no separate + field for the equation number. 
``equation_number`` is a reserved slot + for the parsed-out number; the parsing logic itself belongs to a later + phase, not this one. + """ + + model_config = ConfigDict(frozen=True, extra="forbid") + + kind: Literal[NodeKind.EQUATION] = Field( + default=NodeKind.EQUATION, + description="Discriminator for Page/Section children unions.", + ) + id: str = Field(description="Deterministic identifier, per Spec Section 2.") + provenance: StructuralProvenance = Field( + description="marker_block_ids = [the Equation block's id]." + ) + raw_math: str = Field( + description=( + "The verbatim MathML-ish content, including any equation " + "number embedded inline." + ) + ) + equation_number: str | None = Field( + default=None, + description=( + "A slot for the parsed-out equation number, e.g. '1'. Population " + "logic is out of scope for this layer." + ), + ) + + @field_validator("id") + @classmethod + def _check_id_shape(cls, value: str) -> str: + return validate_object_id_shape(value) \ No newline at end of file diff --git a/src/betydb_extraction/document/figure.py b/src/betydb_extraction/document/figure.py new file mode 100644 index 0000000..02e4013 --- /dev/null +++ b/src/betydb_extraction/document/figure.py @@ -0,0 +1,79 @@ +"""The Figure model. + +Implements the Document Schema Specification, Section 13 ("Figure"). +""" +from __future__ import annotations +import base64 +from typing import Literal +from pydantic import BaseModel, ConfigDict, Field, field_serializer, field_validator +from betydb_extraction.document.caption import Caption +from betydb_extraction.document.enums import NodeKind +from betydb_extraction.document.identifiers import validate_object_id_shape +from betydb_extraction.document.provenance import StructuralProvenance +__all__ = ["Figure"] +class Figure(BaseModel): + """A figure region, with its caption normalized the same way as Table. + + Spec Section 13. 
Marker's ``FigureGroup`` always pairs + ``[Figure, Caption]`` in that order -- the reverse order from + ``TableGroup``'s ``[Caption, Table]`` -- which was confirmed + empirically, not assumed. + + ``image_data`` is included because figures plausibly can carry raster + data even though, in the one representative paper evaluated so far, + only ``Picture`` blocks (journal logo, cover thumbnail) carried + non-empty Marker ``images``, while the one observed ``Figure`` block + had empty ``images``. This is recorded as unconfirmed in the + specification (Section 22), not silently assumed resolved either way. + + Per Spec Section 19, bytes fields must serialize as base64 strings in + JSON. Pydantic v2's plain ``bytes`` type does not do this on its own + (it attempts a literal UTF-8 decode in JSON mode, which raises on + genuine binary image data) -- a ``field_serializer``/``field_validator`` + pair is used here to encode/decode explicitly, rather than reaching + for Pydantic's built-in ``Base64Bytes`` type, which decodes *input* as + base64 (the wrong direction for raw bytes assigned in Python) and does + not emit standard base64 on output either. + """ + model_config = ConfigDict(frozen=True, extra="forbid") + kind: Literal[NodeKind.FIGURE] = Field( + default=NodeKind.FIGURE, + description="Discriminator for Page/Section children unions.", + ) + id: str = Field(description="Deterministic identifier, per Spec Section 2.") + provenance: StructuralProvenance = Field( + description=( + "marker_block_ids = [the Figure block's id] (and FigureGroup id, " + "if present)." + ) + ) + caption: Caption | None = Field(default=None) + image_data: bytes | None = Field( + default=None, + description=( + "Base64-decoded raster image content from Marker's images field, " + "when present. Serializes as a base64 string in JSON per Spec " + "Section 19." 
+ ), + ) + @field_validator("id") + @classmethod + def _check_id_shape(cls, value: str) -> str: + return validate_object_id_shape(value) + + @field_serializer("image_data", when_used="json") + def _serialize_image_data(self, value: bytes | None) -> str | None: + if value is None: + return None + return base64.b64encode(value).decode("ascii") + + @field_validator("image_data", mode="before") + @classmethod + def _decode_image_data(cls, value): + # Accepts a base64 string (e.g. when re-hydrating from JSON) or + # raw bytes (e.g. direct Python construction by the Normalizer) + # interchangeably, so model_validate_json -> model_validate_json + # and direct construction both work uniformly. + if isinstance(value, str): + return base64.b64decode(value) + return value \ No newline at end of file diff --git a/src/betydb_extraction/document/footnote.py b/src/betydb_extraction/document/footnote.py new file mode 100644 index 0000000..be5b690 --- /dev/null +++ b/src/betydb_extraction/document/footnote.py @@ -0,0 +1,60 @@ +"""The Footnote model. + +Implements the Document Schema Specification, Section 15 ("Footnote"). +""" + +from __future__ import annotations + +from typing import Literal + +from pydantic import BaseModel, ConfigDict, Field, field_validator + +from betydb_extraction.document.enums import NodeKind +from betydb_extraction.document.identifiers import validate_object_id_shape +from betydb_extraction.document.provenance import StructuralProvenance + +__all__ = ["Footnote"] + + +class Footnote(BaseModel): + """A footnote block, attached to a Table/Figure geometrically. + + Spec Section 15. Footnotes are flat page-level siblings in Marker's + own output, never children of any Table/Figure -- so this object's + own ``provenance`` never implies attachment. Attachment is recorded + separately via ``attached_object_id``, populated later by the + Normalizer's bbox-proximity heuristic ("nearest preceding Table/Figure + on the same page by bbox y-position"). 
+ + ``attached_object_id`` is deliberately nullable rather than required: + forcing every footnote to resolve to a table/figure would hide + genuine ambiguity behind an incorrect best guess. ``None`` is a + legitimate, recorded outcome -- consistent with the project's broader + principle that unresolved fields are marked as such rather than + silently filled. + """ + + model_config = ConfigDict(frozen=True, extra="forbid") + + kind: Literal[NodeKind.FOOTNOTE] = Field( + default=NodeKind.FOOTNOTE, + description="Discriminator for Page/Section children unions.", + ) + id: str = Field(description="Deterministic identifier, per Spec Section 2.") + provenance: StructuralProvenance = Field( + description="marker_block_ids = [the Footnote block's id]." + ) + raw_text: str = Field(description="Verbatim footnote content.") + attached_object_id: str | None = Field( + default=None, + description=( + "The id of the Table or Figure this footnote was determined to " + "belong to. None when unresolved -- a legitimate outcome, not an " + "implementation gap." + ), + ) + + @field_validator("id") + @classmethod + def _check_id_shape(cls, value: str) -> str: + return validate_object_id_shape(value) \ No newline at end of file diff --git a/src/betydb_extraction/document/identifiers.py b/src/betydb_extraction/document/identifiers.py new file mode 100644 index 0000000..3cc486e --- /dev/null +++ b/src/betydb_extraction/document/identifiers.py @@ -0,0 +1,148 @@ +"""Deterministic identifier generation utilities. 
+ +Implements the identifier strategy defined in the Document Schema +Specification, Section 2 ("Identifier Strategy"), exactly as written: + + document_id = "betydoc:" + sha256(source_pdf_identifier)[:16] + id = "doc:" + sha256(document_id + "|" + canonical_path)[:16] + +These functions are pure and side-effect free: given the same inputs they +always return the same output, satisfying the specification's requirement +that identical PDFs processed through the same Marker and Normalizer +versions always produce identical Document Objects (Section 1.3). + +This module intentionally does *not* decide what a "canonical_path" string +should look like for any particular object (e.g. deriving one from a Marker +block's own path-like id). That decision is made by whatever component +constructs canonical_path strings from Raw Marker Model data -- the +Normalizer, which is out of scope for this implementation phase. This +module only implements the hashing contract itself, plus light-weight +validators confirming a given string conforms to the expected identifier +shape, so that model-level field validators can reject obviously malformed +ids without needing to know how any particular id was derived. +""" + +from __future__ import annotations + +import hashlib +import re + +__all__ = [ + "DOCUMENT_ID_PREFIX", + "OBJECT_ID_PREFIX", + "compute_document_id", + "compute_object_id", + "is_valid_document_id", + "is_valid_object_id", + "validate_document_id_shape", + "validate_object_id_shape", +] + +DOCUMENT_ID_PREFIX = "betydoc:" +OBJECT_ID_PREFIX = "doc:" + +_HASH_TRUNCATION_LENGTH = 16 + +# An identifier is the literal prefix followed by exactly 16 lowercase +# hexadecimal characters, per the truncation length used by both +# compute_document_id and compute_object_id below. 
+_DOCUMENT_ID_PATTERN = re.compile( + rf"^{re.escape(DOCUMENT_ID_PREFIX)}[0-9a-f]{{{_HASH_TRUNCATION_LENGTH}}}$" +) +_OBJECT_ID_PATTERN = re.compile( + rf"^{re.escape(OBJECT_ID_PREFIX)}[0-9a-f]{{{_HASH_TRUNCATION_LENGTH}}}$" +) + + +def compute_document_id(source_pdf_identifier: str) -> str: + """Compute the deterministic ``Document.id`` for a source PDF. + + Per Spec Section 2: a stable external identifier for the source PDF + (a DOI if known, otherwise a content hash of the source PDF bytes) + is hashed and truncated, never combined with Marker version or + processing timestamp, so that re-processing the same PDF always + resolves to the same logical Document Object. + + Args: + source_pdf_identifier: The stable external identifier for the + source PDF (e.g. a DOI string, or a hex content hash). + + Returns: + A string of the form ``"betydoc:" + 16 hex characters``. + """ + digest = hashlib.sha256(source_pdf_identifier.encode("utf-8")).hexdigest() + return DOCUMENT_ID_PREFIX + digest[:_HASH_TRUNCATION_LENGTH] + + +def compute_object_id(document_id: str, canonical_path: str) -> str: + """Compute the deterministic id for any non-Document object. + + Per Spec Section 2: ``id = "doc:" + sha256(document_id + "|" + + canonical_path)[:16]``. The canonical_path is expected to already + have been derived (by the caller -- the Normalizer, in later phases) + from the originating Marker block's own path-like id, or from a + deterministic ordinal position among siblings for objects with no + single Marker block counterpart. + + Args: + document_id: The id of the parent Document, as returned by + ``compute_document_id``. + canonical_path: A deterministic structural path string unique + within the document. + + Returns: + A string of the form ``"doc:" + 16 hex characters``.
+ """ + payload = f"{document_id}|{canonical_path}".encode("utf-8") + digest = hashlib.sha256(payload).hexdigest() + return OBJECT_ID_PREFIX + digest[:_HASH_TRUNCATION_LENGTH] + + +def is_valid_document_id(value: str) -> bool: + """Return True if ``value`` has the shape of a Document id. + + Used by model field validators to reject obviously malformed ids + at construction time, independent of how the id was produced. + """ + return bool(_DOCUMENT_ID_PATTERN.match(value)) + + +def is_valid_object_id(value: str) -> bool: + """Return True if ``value`` has the shape of a non-Document object id. + + Used by model field validators to reject obviously malformed ids + at construction time, independent of how the id was produced. + """ + return bool(_OBJECT_ID_PATTERN.match(value)) + + +def validate_document_id_shape(value: str) -> str: + """Raise ``ValueError`` if ``value`` is not a well-formed Document id. + + Thin wrapper around ``is_valid_document_id`` for direct use inside a + Pydantic ``field_validator``, which expects either a returned (possibly + transformed) value or a raised exception, not a boolean. + """ + if not is_valid_document_id(value): + raise ValueError( + f"{value!r} is not a valid Document id " + f"(expected '{DOCUMENT_ID_PREFIX}' + " + f"{_HASH_TRUNCATION_LENGTH} lowercase hex characters)" + ) + return value + + +def validate_object_id_shape(value: str) -> str: + """Raise ``ValueError`` if ``value`` is not a well-formed object id. + + Thin wrapper around ``is_valid_object_id`` for direct use inside a + Pydantic ``field_validator``, which expects either a returned (possibly + transformed) value or a raised exception, not a boolean. 
+ """ + if not is_valid_object_id(value): + raise ValueError( + f"{value!r} is not a valid object id " + f"(expected '{OBJECT_ID_PREFIX}' + " + f"{_HASH_TRUNCATION_LENGTH} lowercase hex characters)" + ) + return value \ No newline at end of file diff --git a/src/betydb_extraction/document/metadata.py b/src/betydb_extraction/document/metadata.py new file mode 100644 index 0000000..ce74b3c --- /dev/null +++ b/src/betydb_extraction/document/metadata.py @@ -0,0 +1,54 @@ +"""The Metadata model. + +Implements the Document Schema Specification, Section 5 ("Metadata"): +bibliographic and identification facts about the paper, to the extent +they are structurally recoverable. + +Per the spec's boundary note, this model deliberately does **not** +include authors, journal name, publication year, or DOI: per empirical +finding 3.8, front-matter and citation-bearing content is structurally +indistinguishable from other text at the block-type level, so recovering +those facts is scientific/semantic extraction (IR's Citation entity, a +later phase), not structural parsing. Adding such fields here would be a +speculative field, which the specification disallows. +""" +from __future__ import annotations + +from pydantic import BaseModel, ConfigDict, Field + +__all__ = ["Metadata"] + + +class Metadata(BaseModel): + """Bibliographic and identification facts recoverable at the structural layer. + + Spec Section 5. See the module docstring for why authors/journal/ + year/DOI are intentionally absent. + """ + + model_config = ConfigDict(frozen=True, extra="forbid") + + title: str | None = Field( + default=None, + description=( + "Taken verbatim from the first/top-level SectionHeader or " + "title-styled block on the front matter page, if structurally " + "identifiable." + ), + ) + page_count: int = Field( + ge=0, + description=( + "Count of Page objects. 
Redundant with len(Document.pages) but " + "kept as an explicit field since Statistics is meant to hold " + "derived counts, while this is a basic identifying fact worth " + "surfacing without traversing the tree." + ), + ) + has_front_matter_page: bool = Field( + description=( + "Whether any page was structurally flagged as publisher wrapper " + "content. Aggregates Page.is_front_matter (Section 8) across all " + "pages." + ) + ) \ No newline at end of file diff --git a/src/betydb_extraction/document/page.py b/src/betydb_extraction/document/page.py new file mode 100644 index 0000000..a614283 --- /dev/null +++ b/src/betydb_extraction/document/page.py @@ -0,0 +1,88 @@ +"""The Page model. + +Implements the Document Schema Specification, Section 8 ("Page"): one +PDF page's structural content, in reading order. + +Per Version 1.1's note added to this section, ``Reference`` is +deliberately excluded from ``PageChild`` even though it was added to +``SectionChild`` (Spec Section 9). A ``Reference`` only ever appears +under a governing "References" ``Section``, never as a direct child of +``Page`` with no intervening heading. 
+""" +from __future__ import annotations + +from typing import Annotated, Union + +from pydantic import BaseModel, ConfigDict, Field, field_validator + +from betydb_extraction.document.equation import Equation +from betydb_extraction.document.figure import Figure +from betydb_extraction.document.footnote import Footnote +from betydb_extraction.document.identifiers import validate_object_id_shape +from betydb_extraction.document.page_furniture import PageFooter, PageHeader +from betydb_extraction.document.paragraph import Paragraph +from betydb_extraction.document.provenance import StructuralProvenance +from betydb_extraction.document.section import Section +from betydb_extraction.document.table import Table + +__all__ = ["Page", "PageChild"] + + +PageChild = Annotated[ + Union[Section, Paragraph, Table, Figure, Equation, Footnote, PageHeader, PageFooter], + Field(discriminator="kind"), +] +"""The discriminated union of types that may appear in ``Page.children``. + +Per Spec Section 8's field table. Includes ``PageHeader``/``PageFooter``, +which are Page-only (running header/footer content does not occur nested +inside a paper ``Section``). Deliberately excludes ``Reference`` -- see +the module docstring and Spec Section 8's Version 1.1 note. +""" + + +class Page(BaseModel): + """One PDF page's structural content, in reading order. + + Spec Section 8. ``is_front_matter`` is a Normalizer heuristic output + (empirical finding 3.8: Marker gives no structural signal + distinguishing a wrapper page from a content page), not something + copied from Marker -- the field exists now, with its value computed + later, per the spec's deferred-population pattern (Section 22). + """ + + model_config = ConfigDict(frozen=True, extra="forbid") + + id: str = Field( + description=( + "Deterministic identifier, per Spec Section 2; " + "canonical_path = '/page/{page_number}'." 
+ ) + ) + page_number: int = Field( + ge=0, + description="Zero-indexed, matching Marker's own page numbering.", + ) + provenance: StructuralProvenance = Field( + description="marker_block_ids = [the Marker Page block's id]." + ) + children: list[PageChild] = Field( + default_factory=list, + description=( + "Top-level content of the page, in final reading order (Spec " + "Section 3.4)." + ), + ) + is_front_matter: bool = Field( + description=( + "True if this page was identified as publisher wrapper content " + "(journal cover, 'Submit your article,' ISSN-only content, " + "etc.) rather than paper body. A Normalizer heuristic output, " + "not Marker-observed." + ) + ) + + @field_validator("id") + @classmethod + def _check_id_shape(cls, value: str) -> str: + return validate_object_id_shape(value) \ No newline at end of file diff --git a/src/betydb_extraction/document/page_furniture.py b/src/betydb_extraction/document/page_furniture.py new file mode 100644 index 0000000..605b188 --- /dev/null +++ b/src/betydb_extraction/document/page_furniture.py @@ -0,0 +1,55 @@ +"""The PageHeader and PageFooter models. + +Implements the Document Schema Specification, Section 17 ("PageHeader / +PageFooter"). Modeled as two distinct types rather than one generic +"running content" type, mirroring Marker's own distinct block types +one-to-one -- there is no structural reason to merge them, and merging +would lose a type distinction Marker itself already makes. +""" +from __future__ import annotations +from typing import Literal +from pydantic import BaseModel, ConfigDict, Field, field_validator +from betydb_extraction.document.enums import NodeKind +from betydb_extraction.document.identifiers import validate_object_id_shape +from betydb_extraction.document.provenance import StructuralProvenance +__all__ = ["PageFooter", "PageHeader"] +class PageHeader(BaseModel): + """Repeated journal running-header content. + + Spec Section 17. 
Retained for completeness and front-matter + heuristics but not expected to be consumed by extraction. + """ + model_config = ConfigDict(frozen=True, extra="forbid") + kind: Literal[NodeKind.PAGE_HEADER] = Field( + default=NodeKind.PAGE_HEADER, + description="Discriminator for Page children unions.", + ) + id: str = Field(description="Deterministic identifier, per Spec Section 2.") + provenance: StructuralProvenance = Field( + description="marker_block_ids = [the PageHeader block's id]." + ) + raw_text: str = Field(description="Verbatim content.") + @field_validator("id") + @classmethod + def _check_id_shape(cls, value: str) -> str: + return validate_object_id_shape(value) +class PageFooter(BaseModel): + """Repeated journal running-footer content. + + Spec Section 17. Retained for completeness and front-matter + heuristics but not expected to be consumed by extraction. + """ + model_config = ConfigDict(frozen=True, extra="forbid") + kind: Literal[NodeKind.PAGE_FOOTER] = Field( + default=NodeKind.PAGE_FOOTER, + description="Discriminator for Page children unions.", + ) + id: str = Field(description="Deterministic identifier, per Spec Section 2.") + provenance: StructuralProvenance = Field( + description="marker_block_ids = [the PageFooter block's id]." + ) + raw_text: str = Field(description="Verbatim content.") + @field_validator("id") + @classmethod + def _check_id_shape(cls, value: str) -> str: + return validate_object_id_shape(value) \ No newline at end of file diff --git a/src/betydb_extraction/document/paragraph.py b/src/betydb_extraction/document/paragraph.py new file mode 100644 index 0000000..c030444 --- /dev/null +++ b/src/betydb_extraction/document/paragraph.py @@ -0,0 +1,55 @@ +"""The Paragraph model. + +Implements the Document Schema Specification, Section 10 ("Paragraph"): +a single block of body text, the most common leaf content type. 
+""" + +from __future__ import annotations + +from typing import Literal + +from pydantic import BaseModel, ConfigDict, Field, field_validator + +from betydb_extraction.document.enums import NodeKind +from betydb_extraction.document.identifiers import validate_object_id_shape +from betydb_extraction.document.provenance import StructuralProvenance + +__all__ = ["Paragraph"] + + +class Paragraph(BaseModel): + """A single block of body text. + + Spec Section 10. ``text`` preserves Marker's inline HTML markup + verbatim (e.g. ``<i>``, ``<br>``) rather than being stripped to + plain text, per the spec's reasoning: stripping it here would be a + one-way, lossy transformation performed before any consumer has had + a chance to decide whether that markup matters, violating the + "maximum available provenance" invariant. Any stripping is a + retrieval- or extraction-layer concern, out of scope for this layer. + + Note: a ``ListItem`` that is part of a reference list is *not* + modeled as a Paragraph -- see ``Reference``. + """ + + model_config = ConfigDict(frozen=True, extra="forbid") + + kind: Literal[NodeKind.PARAGRAPH] = Field( + default=NodeKind.PARAGRAPH, + description="Discriminator for Page/Section children unions.", + ) + id: str = Field(description="Deterministic identifier, per Spec Section 2.") + text: str = Field( + description=( + "The block's inline HTML content from Marker, as-is, including " + "any inline markup tags." + ) + ) + provenance: StructuralProvenance = Field( + description="marker_block_ids = [the originating Text/ListItem block's id]." + ) + + @field_validator("id") + @classmethod + def _check_id_shape(cls, value: str) -> str: + return validate_object_id_shape(value) \ No newline at end of file diff --git a/src/betydb_extraction/document/processing_metadata.py b/src/betydb_extraction/document/processing_metadata.py new file mode 100644 index 0000000..b4d2ea7 --- /dev/null +++ b/src/betydb_extraction/document/processing_metadata.py @@ -0,0 +1,82 @@ +"""The ProcessingMetadata model. + +Implements the Document Schema Specification, Section 6 +("ProcessingMetadata"): records *how* a Document Object was produced, +separate from *what* it identifies (``Document.id`` / +``source_pdf_identifier``, Section 4). +""" +from __future__ import annotations + +from datetime import datetime, timezone + +from pydantic import BaseModel, ConfigDict, Field, field_validator + +__all__ = ["ProcessingMetadata"] + + +class ProcessingMetadata(BaseModel): + """Records how a Document Object was produced.
+ + Spec Section 6. Only ``marker_version`` is Marker-observed; the rest + are architectural requirements that exist purely to make + reproducibility and drift checkable -- see the spec's "Why this is + architectural" note. None of these fields participate in identifier + computation (Section 2): two Document Objects with the same ``id`` + but different ``ProcessingMetadata`` signal the same paper processed + by a different Marker/Normalizer version, not a different document. + """ + + model_config = ConfigDict(frozen=True, extra="forbid") + + marker_version: str = Field( + description=( + "Verbatim from Marker's own output metadata, if present; " + "otherwise the version string of the Marker invocation recorded " + "by the adapter." + ) + ) + normalizer_version: str = Field( + description=( + "Semantic version of the Normalizer code that produced this " + "Document Object. Required so a future schema/logic change is " + "always attributable." + ) + ) + processed_at: datetime = Field( + description=( + "Wall-clock time of this materialization, ISO 8601 UTC. " + "Explicitly not part of id computation (Section 2) -- recorded " + "for audit/debugging only." + ) + ) + source_marker_artifact_ref: str = Field( + description=( + "A path or content hash identifying the exact Raw Marker Model " + "JSON file this Document Object was normalized from, satisfying " + "the 'Document has no own provenance' note in Section 4 by " + "pointing at the file-level artifact instead of a block-level " + "one." + ) + ) + + @field_validator("processed_at") + @classmethod + def _check_processed_at_is_utc(cls, value: datetime) -> datetime: + # Spec 19 requires datetime fields to serialize as ISO 8601 strings + # in UTC. 
A naive datetime has no defined offset, so it cannot be + # asserted to *be* UTC -- it is rejected rather than silently + # assumed to be UTC, since silently assuming would risk a wrong + # audit timestamp under invariant 1.5 (deterministic, lossless + # serialization implies the value round-trips meaning intact, not + # just bytes). + if value.tzinfo is None: + raise ValueError( + "ProcessingMetadata.processed_at must be timezone-aware " + "(ISO 8601 UTC per Spec Section 19); got a naive datetime." + ) + if value.utcoffset() != timezone.utc.utcoffset(None): + raise ValueError( + "ProcessingMetadata.processed_at must be UTC per Spec " + f"Section 19; got offset {value.utcoffset()}." + ) + return value \ No newline at end of file diff --git a/src/betydb_extraction/document/provenance.py b/src/betydb_extraction/document/provenance.py new file mode 100644 index 0000000..c170973 --- /dev/null +++ b/src/betydb_extraction/document/provenance.py @@ -0,0 +1,150 @@ +"""Foundational supporting value objects: geometry and provenance. + +Implements the Document Schema Specification, Section 3 ("Foundational +Supporting Types"): ``BoundingBox`` (3.1), ``Polygon`` (3.2), and +``StructuralProvenance`` (3.3). These are not top-level entities; they are +immutable value objects embedded inside every other Document-layer model +that needs to record geometry or trace back to its originating Marker +block(s). +""" + +from __future__ import annotations + +from pydantic import BaseModel, ConfigDict, Field, model_validator + +__all__ = ["BoundingBox", "Polygon", "StructuralProvenance"] + + +class BoundingBox(BaseModel): + """An axis-aligned bounding box, carried over verbatim from Marker. + + Spec Section 3.1. Corresponds directly to Marker's own ``bbox`` field + (already typed as ``MarkerBBox`` in the Raw Marker Model). 
Retained at + the Document layer because footnote-to-table attachment, evidence + highlighting in the Scientist Review UI, and any future geometric + reconstruction all require it. + + Invariant: ``x1 >= x0`` and ``y1 >= y0``, enforced at construction + time so that an inverted box -- which would render incorrectly in any + downstream evidence UI with no error signal -- can never exist. + """ + + model_config = ConfigDict(frozen=True, extra="forbid") + + x0: float = Field(description="Left edge.") + y0: float = Field(description="Top edge.") + x1: float = Field(description="Right edge.") + y1: float = Field(description="Bottom edge.") + + @model_validator(mode="after") + def _check_box_is_not_inverted(self) -> "BoundingBox": + if self.x1 < self.x0: + raise ValueError( + f"BoundingBox is inverted on the x-axis: x1={self.x1} < x0={self.x0}" + ) + if self.y1 < self.y0: + raise ValueError( + f"BoundingBox is inverted on the y-axis: y1={self.y1} < y0={self.y0}" + ) + return self + + +class Polygon(BaseModel): + """A four-point polygon, carried over verbatim from Marker. + + Spec Section 3.2. Retained even though a ``BoundingBox`` is in + principle derivable from it, because Marker provides both + independently and the polygon can capture skew that an axis-aligned + bbox cannot. No Document-layer object computes one from the other; + both are forwarded from Marker as-is. + + ``points`` is exactly four ``(x, y)`` pairs, matching the shape Marker + itself emits for its own ``polygon`` field. + """ + + model_config = ConfigDict(frozen=True, extra="forbid") + + points: tuple[ + tuple[float, float], + tuple[float, float], + tuple[float, float], + tuple[float, float], + ] = Field(description="Exactly four (x, y) corner points, as emitted by Marker.") + + +class StructuralProvenance(BaseModel): + """Traces a Document-layer object back to its originating Marker block(s). + + Spec Section 3.3. 
This is the type that satisfies the architectural + invariant that every object preserve the finest provenance available + from Marker (Spec Section 1.4). Every non-``Document`` object in the + schema carries exactly one ``StructuralProvenance`` instance. + """ + + model_config = ConfigDict(frozen=True, extra="forbid") + + marker_block_ids: list[str] = Field( + min_length=1, + description=( + "The originating Marker block id(s), e.g. ['/page/7/Table/2']. " + "A list rather than a single value because some Document objects " + "(e.g. a normalized Caption under Pattern B) are synthesized from " + "more than one Marker block." + ), + ) + page_number: int = Field( + description=( + "The PDF page this object originates from. For objects synthesized " + "from multiple blocks, the page of the primary/first contributing " + "block." + ) + ) + bbox: BoundingBox | None = Field( + default=None, + description=( + "Present for an object with a single, well-defined originating " + "region. Mutually exclusive with contributing_bboxes." + ), + ) + contributing_bboxes: list[BoundingBox] | None = Field( + default=None, + description=( + "Used instead of bbox when more than one Marker block contributes " + "geometry, preserving each box rather than collapsing them into a " + "single misleading region. Mutually exclusive with bbox." + ), + ) + polygon: Polygon | None = Field( + default=None, + description="Mirrors bbox's optionality logic.", + ) + reading_order_index: int = Field( + ge=0, + description=( + "The object's position in the document's global linear reading " + "order (Spec Section 3.4), recomputed by the Normalizer from final " + "tree position -- never copied from a Marker id's trailing index " + "number, which was empirically confirmed non-monotonic with true " + "reading order." 
+ ), + ) + section_path: list[str] = Field( + default_factory=list, + description=( + "The chain of governing SectionHeader Marker-block ids, ordered " + "outermost to innermost, derived from Marker's own " + "section_hierarchy map (Spec Section 3.5). Empty only for objects " + "outside any section (e.g. a journal wrapper page's Picture)." + ), + ) + + @model_validator(mode="after") + def _check_bbox_xor_contributing_bboxes(self) -> "StructuralProvenance": + if self.bbox is not None and self.contributing_bboxes is not None: + raise ValueError( + "StructuralProvenance may not set both 'bbox' and " + "'contributing_bboxes' -- exactly one geometric claim about " + "this object's origin is permitted, or neither when no " + "recoverable geometry exists." + ) + return self \ No newline at end of file diff --git a/src/betydb_extraction/document/reference.py b/src/betydb_extraction/document/reference.py new file mode 100644 index 0000000..4871410 --- /dev/null +++ b/src/betydb_extraction/document/reference.py @@ -0,0 +1,51 @@ +"""The Reference model. + +Implements the Document Schema Specification v1.1, Section 16 +("Reference (Bibliography Entry)"). +""" + +from __future__ import annotations + +from typing import Literal + +from pydantic import BaseModel, ConfigDict, Field, field_validator + +from betydb_extraction.document.enums import NodeKind +from betydb_extraction.document.identifiers import validate_object_id_shape +from betydb_extraction.document.provenance import StructuralProvenance + +__all__ = ["Reference"] + + +class Reference(BaseModel): + """One bibliography entry.""" + + model_config = ConfigDict( + frozen=True, + extra="forbid", + ) + + kind: Literal[NodeKind.REFERENCE] = Field( + default=NodeKind.REFERENCE, + description="Discriminator for Section.children.", + ) + + id: str = Field( + description="Deterministic identifier, per Spec Section 2." + ) + + provenance: StructuralProvenance = Field( + description="marker_block_ids = [the ListItem block's id]." 
+ ) + + raw_text: str = Field( + description=( + "Verbatim reference entry text, including any inline markup " + "Marker preserved." + ) + ) + + @field_validator("id") + @classmethod + def _check_id_shape(cls, value: str) -> str: + return validate_object_id_shape(value) \ No newline at end of file diff --git a/src/betydb_extraction/document/section.py b/src/betydb_extraction/document/section.py new file mode 100644 index 0000000..e63edce --- /dev/null +++ b/src/betydb_extraction/document/section.py @@ -0,0 +1,99 @@ +"""The Section model. + +Implements the Document Schema Specification, Section 9 ("Section"): a +heading-governed grouping of content, derived from Marker's +``section_hierarchy`` breadcrumbs rather than re-derived from text +pattern-matching on headings. +""" + +from __future__ import annotations + +from typing import Annotated, Literal, Union + +from pydantic import BaseModel, ConfigDict, Field, field_validator + +from betydb_extraction.document.enums import NodeKind +from betydb_extraction.document.equation import Equation +from betydb_extraction.document.figure import Figure +from betydb_extraction.document.footnote import Footnote +from betydb_extraction.document.identifiers import validate_object_id_shape +from betydb_extraction.document.paragraph import Paragraph +from betydb_extraction.document.provenance import StructuralProvenance +from betydb_extraction.document.table import Table +from betydb_extraction.document.reference import Reference + +__all__ = ["Section", "SectionChild"] + + +class Section(BaseModel): + """A heading-governed grouping of content. + + Spec Section 9. A ``Section`` may contain further ``Section`` objects + (genuine nesting, not a flat list with a depth integer alone). + + Per Spec Section 9's closing note: a Marker ``SectionHeader`` block + that is really a table/figure caption label (e.g. one containing only + "Table 3") does not become a ``Section`` -- it is consumed into that + Table's/Figure's ``Caption`` instead. 
That disambiguation is + Normalizer business logic, out of scope here; this model simply + represents the outcome: a ``Section`` always corresponds to a genuine + paper section heading. + """ + + model_config = ConfigDict(frozen=True, extra="forbid") + + kind: Literal[NodeKind.SECTION] = Field( + default=NodeKind.SECTION, + description="Discriminator for Page/Section children unions.", + ) + id: str = Field(description="Deterministic identifier, per Spec Section 2.") + heading_text: str = Field( + description="Verbatim text of the governing SectionHeader block." + ) + provenance: StructuralProvenance = Field( + description="marker_block_ids = [the SectionHeader block's id]." + ) + depth: int = Field( + ge=0, + description=( + "Position of this section's heading in the ordered section_path " + "list, zero-indexed from the outermost heading on the " + "page/document." + ), + ) + children: list["SectionChild"] = Field( + default_factory=list, + description=( + "Nested sub-sections and content governed by this heading, in " + "reading order." + ), + ) + + @field_validator("id") + @classmethod + def _check_id_shape(cls, value: str) -> str: + return validate_object_id_shape(value) + + +SectionChild = Annotated[ + Union[ + Section, + Paragraph, + Table, + Figure, + Equation, + Footnote, + Reference, + ], + Field(discriminator="kind"), +] +"""The discriminated union of types that may appear in ``Section.children``. + +Per Document Schema Specification v1.1. Reference is now a valid child +of Section, representing bibliography entries inside a References +section. Deliberately excludes PageHeader and +PageFooter, which are Page-only per Spec Section 8 -- running header/footer +content does not occur nested inside a paper Section. 
class Statistics(BaseModel):
    """Serializable snapshot of aggregate counts for one Document tree.

    Spec Section 7. Each count could in principle be recomputed by walking
    the tree; keeping them as an explicit object lets Normalizer runs be
    logged and compared without re-parsing, and gives
    ``unresolved_footnote_count`` -- the success signal for the
    footnote-attachment heuristic (empirical finding 3.2) -- a queryable
    home.

    The model never derives its counts from a tree. It only validates the
    numbers it is handed; computing them from a real ``Document`` is
    Normalizer business logic and out of scope for this structural layer.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    page_count: int = Field(
        ge=0,
        description="len(pages).",
    )
    section_count: int = Field(
        ge=0,
        description="Total Section objects across the document.",
    )
    paragraph_count: int = Field(
        ge=0,
        description="Total Paragraph objects.",
    )
    table_count: int = Field(
        ge=0,
        description="Total Table objects.",
    )
    figure_count: int = Field(
        ge=0,
        description="Total Figure objects.",
    )
    equation_count: int = Field(
        ge=0,
        description="Total Equation objects.",
    )
    footnote_count: int = Field(
        ge=0,
        description="Total Footnote objects.",
    )
    reference_count: int = Field(
        ge=0,
        description=(
            "Total Reference objects. As of Version 1.1, Reference objects "
            "are reachable as Section.children members under a References "
            "Section, so this is a true traversal count."
        ),
    )
    unresolved_footnote_count: int = Field(
        ge=0,
        description=(
            "Footnotes whose attached_object_id is None after Normalizer "
            "processing -- a direct, queryable signal of how much of the "
            "geometric-attachment heuristic (empirical finding 3.2) "
            "succeeded on this paper."
        ),
    )

    @model_validator(mode="after")
    def _check_unresolved_does_not_exceed_total(self) -> "Statistics":
        """Reject a snapshot claiming more unresolved footnotes than footnotes."""
        unresolved = self.unresolved_footnote_count
        total = self.footnote_count
        if unresolved <= total:
            return self
        raise ValueError(
            f"Statistics.unresolved_footnote_count ({unresolved}) "
            f"cannot exceed footnote_count ({total})."
        )
class TableRowCell(BaseModel):
    """One logical cell within a parsed table row.

    Spec Section 12.2. ``text`` has any ``<math>`` wrapper tag stripped,
    its content treated as equivalent plain text -- Marker was observed
    to inconsistently wrap numerically identical ``mean +/- stderr``
    values in ``<math>`` depending on which OCR path produced that cell,
    an artifact carrying no structural meaning.

    Does not carry ``row_index``/``col_index`` integers: a cell's row and
    column are already implicit in ``Table.rows``' list-of-lists
    structure (the cell's containing ``TableRow``'s position, and the
    cell's own position within that row), so redundant integer fields
    would duplicate information already present in list order.

    Does not carry ``rowspan``/``colspan``: Marker's HTML output was
    empirically confirmed to never emit these attributes, even where the
    source PDF visually has merged cells -- it flattens instead. Adding
    fields for a case never observed in Marker's actual output would be
    a speculative field, which the specification explicitly disallows.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    text: str = Field(
        description=(
            "Cell text content from the parsed <th>/<td> element, with any "
            "<math> wrapper tag stripped and its content treated as plain text."
        )
    )
    is_header: bool = Field(
        description="True if the source element was <th>, false for <td>."
    )
    structural_notes: str | None = Field(
        default=None,
        description=(
            "A free-text slot reserved for a Normalizer-attached structural "
            "annotation, most notably a suspected merged-cell placeholder "
            "(Marker silently flattens merged header cells into duplicated "
            "rows with an empty filler cell, with no flag distinguishing this "
            "from a genuinely empty cell). The heuristic for populating this "
            "field is explicitly not decided by the specification -- it is an "
            "open slot reserved so that decision can be made later without a "
            "schema change."
        ),
    )


class TableRow(BaseModel):
    """One logical row within a table's parsed structure.

    Spec Section 12.1. ``cells`` is constrained to a minimum length of
    one: per Spec Section 20's validation rule ("if rows is non-empty,
    every TableRow.cells list has at least one element"), a row with
    zero cells is not a meaningful row and would indicate a parse error
    in the source HTML upstream, which should surface as a Normalizer-time
    error rather than a silently-accepted empty row reaching the Document
    Object.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    cells: list[TableRowCell] = Field(
        min_length=1,
        description="Ordered left to right per the source <tr> element.",
    )


class TableCell(BaseModel):
    """One Marker TableCell block, retained as evidence/geometry only.

    Spec Section 12.3. This is *not* the logical cell representation
    (that is ``TableRowCell``, embedded in ``Table.rows``) -- this object
    exists solely so that a specific cell's bounding box can be
    highlighted as evidence in a future Scientist Review interface.
    ``text`` here is verbatim, not math-stripped, because this object is
    evidence/geometry rather than the logical text consumers should read.

    The specification does not assert a guaranteed positional
    correspondence between a given ``TableRowCell`` and a given
    ``TableCell`` (e.g. for highlighting the geometry of a specific
    logical cell) -- any such correlation is an unverified Normalizer-time
    heuristic, out of scope for this model.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    id: str = Field(description="Deterministic identifier, per Spec Section 2.")
    text: str = Field(description="Verbatim Marker TableCell content.")
    bbox: BoundingBox = Field(description="Per-cell geometry.")
    polygon: Polygon = Field(description="Mirrors bbox.")

    @field_validator("id")
    @classmethod
    def _check_id_shape(cls, value: str) -> str:
        """Delegate id-shape checking to the shared identifier validator."""
        return validate_object_id_shape(value)


class Table(BaseModel):
    """A table's logical structure and evidence-level cell geometry.

    Spec Section 12. ``raw_html`` is treated as the source of truth for
    logical structure (rows, columns, header rows) -- ``rows`` is a
    structured parse derived from it, not an independent reconstruction,
    precisely because reconstructing structure independently from cell
    geometry risks disagreeing with Marker's own already-correct HTML
    parse. ``cells`` is retained purely as supplementary evidence/geometry
    data and is never used to derive ``rows``.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    kind: Literal[NodeKind.TABLE] = Field(
        default=NodeKind.TABLE,
        description="Discriminator for Page/Section children unions.",
    )
    id: str = Field(description="Deterministic identifier, per Spec Section 2.")
    provenance: StructuralProvenance = Field(
        description=(
            "marker_block_ids = [the Table block's id] (and the TableGroup "
            "id too, if Pattern A)."
        )
    )
    caption: Caption | None = Field(
        default=None,
        description=(
            "None only if no caption-bearing blocks were found adjacent to "
            "the table at all -- not empirically observed in the "
            "representative paper, but not assumed impossible."
        ),
    )
    raw_html: str = Field(
        description=(
            "The Table block's own html field, verbatim -- the complete, "
            "correctly-nested <table>...</table> Marker produces. The source "
            "of truth for logical structure."
        )
    )
    rows: list[TableRow] = Field(
        default_factory=list,
        description=(
            "A structured parse of raw_html's <tr> elements into row "
            "objects, derived from raw_html, not an independent "
            "reconstruction. May be empty."
        ),
    )
    cells: list[TableCell] = Field(
        default_factory=list,
        description=(
            "The flat list of Marker TableCell child blocks, retained only "
            "as evidence/geometry data. May be empty."
        ),
    )
    footnote_ids: list[str] = Field(
        default_factory=list,
        description=(
            "Ids of Footnote objects geometrically attached to this table. "
            "Empty until the Normalizer's bbox-proximity heuristic runs; the "
            "field exists now so that heuristic's output has a defined home "
            "without a later schema change."
        ),
    )

    @field_validator("id")
    @classmethod
    def _check_id_shape(cls, value: str) -> str:
        """Delegate id-shape checking to the shared identifier validator."""
        return validate_object_id_shape(value)
It performs NO cleaning, NO normalization, NO restructuring, and NO +semantic or type-specific interpretation. + +Design decisions (deliberate, see inline notes for rationale) +--------------------------------------------------------------- +1. Single uniform block shape (`MarkerBlock`), not a discriminated union per + `block_type`. Empirically, every node Marker emits -- "Document", "Page", + "Table", "TableCell", "Text", "Footnote", etc. -- shares one identical + field envelope: id, block_type, html, polygon, bbox, children, + section_hierarchy, images. Modeling this as a discriminated union would + require the Raw layer to "know" what each block_type means, which is + exactly the interpretation step that belongs to the Normalizer, not here. + A new Marker block_type we've never seen must still parse successfully + into this model with zero code changes -- that is the whole point of a + lossless mirror. + +2. Fully immutable (`frozen=True`), and recursively so: every nested + `MarkerBlock` in `children` is itself frozen. The Raw Marker Model must + never be modified once constructed; only the Normalizer may *read* it to + produce a separate Document Object. + +3. No silent data coercion that could lose information. Fields Marker may + omit or set to null (`children`, `images`) are modeled as `Optional` with + `None` defaults rather than being defaulted to empty containers, because + "absent" and "empty" are observably different states in real Marker output + and collapsing them would already be a (small) normalization decision -- + which is explicitly out of scope for this layer. + +4. Extra/unknown fields are preserved, not dropped. Marker's schema may grow + new fields in future versions. 
Per the "schema insulation from future + Marker updates" goal, this model is configured to retain any field it + does not explicitly know about (via `model_config['extra'] = "allow"`) + so that upgrading Marker never silently discards data even before the + Pydantic models are updated to formally recognize a new field. + +5. No deterministic-id derivation happens here. Marker's own `id` strings + (e.g. "/page/7/Table/2") are preserved verbatim as provenance. Deriving + the Document Object's *own* stable identifiers from these is a Normalizer + responsibility (see `betydb_extraction.document.identifiers`), because + identifier derivation is itself a form of interpretation. + +Empirical grounding +-------------------- +This model was derived by direct inspection of Marker's output for a +representative paper (Smukler et al. 2012, Journal of Soil and Water +Conservation). See `marker_empirical_findings_paper1.md` for the full +findings. Key confirmed facts encoded here: + +- Every block (container or leaf) shares the exact same field set: id, + block_type, html, polygon, bbox, children, section_hierarchy, images. +- Leaf blocks have `children = None` and real inline HTML content in `html`. +- Container blocks have a non-None `children` list and an `html` field that + is itself just a manifest of `` pointers -- i.e. + for containers, `html` is redundant with `children` and should not be + treated as primary content. +- `images` is a dict keyed by the block's own id, populated only for blocks + that embed raster image data (observed: `Picture` blocks); it is `{}` for + the overwhelming majority of blocks and `None` was not observed in this + paper, but is modeled as possible for robustness. +- `section_hierarchy` is a dict mapping a depth-index string to the id of + the governing SectionHeader block at that depth. +- `Span` and `Line`, present in Marker's separate `page_stats` summary, do + NOT appear as nodes in the actual tree; they are absorbed into parent + `html`. 
class MarkerPolygonPoint(BaseModel):
    """
    A single [x, y] corner point of a block's bounding polygon, in Marker's
    native page coordinate space (origin top-left, units are PDF points as
    emitted by Marker -- Marker does not normalize these to 0..1).

    Modeled as a named structure rather than a bare ``tuple[float, float]``
    so that round-trip JSON schema generation is self-describing and so
    that future fields (e.g. a confidence score per point) could be added
    without breaking the shape.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    x: float
    y: float

    @classmethod
    def from_pair(cls, pair: "list[float] | tuple[float, float]") -> "MarkerPolygonPoint":
        """Construct from Marker's raw ``[x, y]`` list/tuple representation.

        Raises ``ValueError`` (from tuple unpacking) if *pair* does not
        contain exactly two items.
        """
        x, y = pair
        return cls(x=x, y=y)

    def to_pair(self) -> list[float]:
        """Serialize back to Marker's raw ``[x, y]`` list representation."""
        return [self.x, self.y]


class MarkerBBox(BaseModel):
    """
    Axis-aligned bounding box in Marker's native page coordinate space, as
    the flat 4-tuple ``[x0, y0, x1, y1]`` Marker emits alongside (and
    derivable from, but not always numerically identical to, due to
    floating point and polygon-vs-bbox rounding) the polygon.

    Both ``polygon`` and ``bbox`` are preserved on every block because they
    are both present, independently, in real Marker output, and this layer
    does not assume one is redundant with the other.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    x0: float
    y0: float
    x1: float
    y1: float

    @classmethod
    def from_list(cls, values: "list[float]") -> "MarkerBBox":
        """Construct from Marker's raw ``[x0, y0, x1, y1]`` list representation.

        Raises ``ValueError`` (from tuple unpacking) if *values* does not
        contain exactly four items.
        """
        x0, y0, x1, y1 = values
        return cls(x0=x0, y0=y0, x1=x1, y1=y1)

    def to_list(self) -> list[float]:
        """Serialize back to Marker's raw ``[x0, y0, x1, y1]`` list representation."""
        return [self.x0, self.y0, self.x1, self.y1]


class MarkerBlock(BaseModel):
    """
    A single node in Marker's output tree.

    This is intentionally the ONLY block model in the Raw Marker layer.
    Every block Marker can produce -- "Document", "Page", "Table",
    "TableCell", "Text", "SectionHeader", "Footnote", "Caption", "Figure",
    "FigureGroup", "TableGroup", "Picture", "ListGroup", "ListItem",
    "PageHeader", "PageFooter", "Equation", and any block_type introduced by
    a future Marker version -- is represented by this same shape. See the
    module docstring, point 1, for the rationale.

    Interpreting what a given ``block_type`` *means* (e.g. "a TableGroup's
    children are always exactly [Caption, Table]") is explicitly a
    Normalizer concern and must not be encoded as validation logic on this
    model. This model only guarantees that the shape Marker actually emits
    parses losslessly; it makes no claims about the semantics of any
    particular block_type.
    """

    model_config = ConfigDict(
        frozen=True,
        # Marker's schema may evolve. Unknown fields are preserved rather
        # than silently dropped, so that upgrading Marker never causes
        # silent data loss even before this model is updated to formally
        # recognize a new field. See module docstring, point 4.
        extra="allow",
    )

    id: Optional[str] = Field(
        default=None,
        description=(
            "Marker's own block identifier, e.g. '/page/7/Table/2'. This is "
            "a path-like string encoding page index, block_type, and a "
            "local positional index. It is preserved verbatim and is NOT "
            "guaranteed to be globally stable across different Marker "
            "versions or runs -- treat it as provenance to the specific "
            "Marker invocation that produced this tree, not as a permanent "
            "cross-run identifier. Permanent identifiers are derived later, "
            "in the Document Object layer. Modeled as Optional because the "
            "root 'Document' block in observed Marker output omits this "
            "field entirely (along with html, polygon, bbox, and "
            "section_hierarchy) -- it is structurally a bare wrapper "
            "around the page children and carries no positional identity "
            "of its own."
        ),
    )

    block_type: str = Field(
        ...,
        description=(
            "Marker's block type tag, e.g. 'Page', 'Table', 'TableCell', "
            "'Text', 'SectionHeader', 'Footnote', 'Caption', 'Figure', "
            "'FigureGroup', 'TableGroup', 'Picture', 'ListGroup', "
            "'ListItem', 'PageHeader', 'PageFooter', 'Equation', "
            "'Document'. Modeled as a plain ``str`` rather than an ``Enum`` "
            "or discriminator so that an unrecognized block_type from a "
            "future Marker version still parses successfully."
        ),
    )

    html: str = Field(
        default="",
        description=(
            "For leaf blocks: the actual inline HTML content of this block "
            "(e.g. '<p>...</p>'). For container blocks: "
            "a manifest of '<content-ref src=...>' "
            "pointers to this block's children, in reading order -- in "
            "that case this field is redundant with `children` and should "
            "be treated as a reading-order hint only, not primary content. "
            "Distinguishing these two cases is a Normalizer responsibility "
            "(in practice: `children is None` implies the html is real "
            "leaf content; `children is not None` implies it is a "
            "content-ref manifest), not something this model decides."
        ),
    )

    polygon: Optional[list[MarkerPolygonPoint]] = Field(
        default=None,
        description=(
            "The block's bounding polygon as a list of corner points, in "
            "Marker's native page coordinate space. Observed in practice as "
            "4 points (a rectangle) but modeled as a list of arbitrary "
            "length since Marker's polygon format is not contractually "
            "limited to 4 points."
        ),
    )

    bbox: Optional[MarkerBBox] = Field(
        default=None,
        description=(
            "The block's axis-aligned bounding box [x0, y0, x1, y1] in "
            "Marker's native page coordinate space."
        ),
    )

    children: Optional[list["MarkerBlock"]] = Field(
        default=None,
        description=(
            "Nested child blocks, in reading order, or `None` for a true "
            "leaf block. `None` and `[]` are deliberately NOT collapsed "
            "into one representation -- in observed Marker output, leaf "
            "blocks have `children: None`, never `children: []`; preserving "
            "this distinction exactly as Marker emits it is part of this "
            "layer's lossless mandate."
        ),
    )

    section_hierarchy: dict[str, str] = Field(
        default_factory=dict,
        description=(
            "A mapping from depth-index string (e.g. '1', '4') to the "
            "Marker block id of the governing SectionHeader at that depth, "
            "as emitted by Marker for this specific block. This is a live "
            "breadcrumb of the heading path above this block at the time "
            "Marker produced the tree."
        ),
    )

    images: Optional[dict[str, str]] = Field(
        default=None,
        description=(
            "A mapping from (typically this block's own) Marker id to a "
            "base64-encoded image payload. Populated for blocks that embed "
            "raster image data (observed: 'Picture' blocks); `{}` for the "
            "large majority of blocks that carry no image data. Modeled as "
            "Optional rather than defaulting to `{}` because `None` is not "
            "ruled out by Marker's schema, even though `{}` was the only "
            "empty case directly observed in this paper's output -- see "
            "point 3 in the module docstring on not collapsing absent vs. "
            "empty."
        ),
    )

    @field_validator("polygon", mode="before")
    @classmethod
    def _coerce_polygon(cls, value: Any) -> Any:
        """
        Accept Marker's native raw polygon representation -- a list of
        ``[x, y]`` pairs, e.g. ``[[0.0, 0.0], [635.0, 0.0], ...]`` -- and
        coerce each pair into the structured ``MarkerPolygonPoint`` shape
        this model declares.

        This is intentionally a pure, lossless reshape (list-of-lists ->
        list-of-named-points) and not a normalization: no coordinate
        values are altered, reordered, deduplicated, or interpreted.
        Already-structured input (``MarkerPolygonPoint`` instances or
        dicts, e.g. when round-tripping a previously-validated model back
        through validation) is passed through unchanged. ``None`` is
        returned as-is.
        """
        if value is None:
            return value
        # Only raw [x, y] pairs need reshaping; instances and dicts are
        # left for pydantic's own field validation to handle.
        return [
            point
            if isinstance(point, (MarkerPolygonPoint, dict))
            else MarkerPolygonPoint.from_pair(point)
            for point in value
        ]

    @field_validator("bbox", mode="before")
    @classmethod
    def _coerce_bbox(cls, value: Any) -> Any:
        """
        Accept Marker's native raw bbox representation -- a flat
        ``[x0, y0, x1, y1]`` list -- and coerce it into the structured
        ``MarkerBBox`` shape this model declares.

        Same rationale as ``_coerce_polygon``: a lossless reshape of the
        wire format into a self-describing structure, not a normalization
        of the underlying values. ``None``, ``MarkerBBox`` instances and
        dicts are passed through unchanged.
        """
        if value is None or isinstance(value, (MarkerBBox, dict)):
            return value
        return MarkerBBox.from_list(value)

    def is_leaf(self) -> bool:
        """
        Return True if this block is a leaf (``children is None``).

        This is a pure, non-mutating read of the block's own shape -- it is
        provided as a convenience for callers (e.g. the future Normalizer)
        and performs no interpretation of *what kind* of leaf this is.
        Note that an empty ``children`` list is NOT a leaf by this test.
        """
        return self.children is None

    def iter_descendants(self) -> "list[MarkerBlock]":
        """
        Return all descendant blocks (not including self) in document order,
        via a depth-first pre-order traversal.

        This is read-only tree traversal, not normalization: it does not
        interpret block_type semantics, does not deduplicate, and does not
        reorder anything relative to Marker's own ``children`` ordering.

        Implemented with an explicit stack so that deep trees neither hit
        the recursion limit nor re-copy partial result lists at each level.
        """
        result: list[MarkerBlock] = []
        # Children are pushed reversed so the first child is popped first,
        # which keeps the visit order identical to recursive pre-order.
        pending: list[MarkerBlock] = list(reversed(self.children or []))
        while pending:
            block = pending.pop()
            result.append(block)
            if block.children:
                pending.extend(reversed(block.children))
        return result
+ This wrapper type exists for two reasons rather than simply using + ``MarkerBlock`` directly as the root: + + 1. To give the root of the tree a distinct, intention-revealing type at + the API boundary (the Marker adapter's parse function returns a + ``MarkerDocument``, not a bare ``MarkerBlock``, making misuse -- e.g. + accidentally passing a non-root block where a full document is + expected -- a type error rather than a silent bug). + 2. To provide a seam for any future top-level metadata Marker may add + outside the block tree itself (e.g. Marker version, processing + timestamp) without having to retrofit the core ``MarkerBlock`` model. + + No semantic validation (e.g. "root must have block_type == 'Document'") + is enforced here beyond what is structurally true of the input, in + keeping with this layer's mandate to mirror Marker losslessly rather + than police its output. + """ + + model_config = ConfigDict(frozen=True, extra="allow") + + root: MarkerBlock = Field( + ..., + description=( + "The root block of Marker's output tree for this document " + "(observed block_type: 'Document')." + ), + ) + + source_marker_json_path: Optional[str] = Field( + default=None, + description=( + "Optional filesystem path or identifier of the raw Marker JSON " + "file this object was parsed from, retained purely for " + "debugging and provenance traceability. Not part of Marker's " + "own output -- populated by the adapter at parse time." + ), + ) + + @property + def pages(self) -> list[MarkerBlock]: + """ + Convenience accessor for the root's direct children, which are + empirically always the per-page blocks (block_type == 'Page'). + + This performs no filtering or validation of block_type -- it simply + returns ``root.children`` (or an empty list if somehow absent), in + original order. Any assumption about *which* block_types are + present belongs to the Normalizer, not here. 
def make_provenance(
    canonical_path: str = "/page/0/Text/0",
    *,
    reading_order_index: int = 0,
    page_number: int = 0,
    bbox: BoundingBox | None = None,
    section_path: list[str] | None = None,
) -> StructuralProvenance:
    """Build a minimal valid StructuralProvenance for test fixtures."""
    return StructuralProvenance(
        marker_block_ids=[canonical_path],
        page_number=page_number,
        bbox=bbox,
        reading_order_index=reading_order_index,
        section_path=section_path or [],
    )


def make_id(canonical_path: str) -> str:
    """Compute a deterministic object id rooted at TEST_DOCUMENT_ID."""
    return compute_object_id(TEST_DOCUMENT_ID, canonical_path)


def make_paragraph(path: str = "/page/0/Text/0", text: str = "Body text.") -> Paragraph:
    """Build a minimal valid Paragraph whose id and provenance derive from *path*."""
    return Paragraph(
        id=make_id(path),
        text=text,
        provenance=make_provenance(path),
    )


def make_footnote(path: str = "/page/0/Footnote/0") -> Footnote:
    """Build a minimal valid Footnote with fixed raw text."""
    return Footnote(
        id=make_id(path),
        provenance=make_provenance(path),
        raw_text="1. See methods for details.",
    )


def make_reference(path: str = "/page/9/ListItem/0") -> Reference:
    """Build a minimal valid Reference with fixed raw text."""
    return Reference(
        id=make_id(path),
        provenance=make_provenance(path),
        raw_text="Smukler, S. et al. 2012. Nutrient cycling. J. Agron.",
    )


def make_equation(path: str = "/page/3/Equation/0") -> Equation:
    """Build a minimal valid Equation with fixed raw math."""
    return Equation(
        id=make_id(path),
        provenance=make_provenance(path),
        raw_math="y = mx + b ... (1)",
    )


def make_caption(path: str = "/page/6/Caption/0") -> Caption:
    """Build a minimal valid Caption (label "Table 3")."""
    return Caption(
        label="Table 3",
        text="Nutrient flux by treatment.",
        provenance=make_provenance(path),
    )


def make_table(path: str = "/page/6/Table/0") -> Table:
    """Build a minimal valid one-row, one-cell Table.

    ``raw_html`` is a single-cell table whose content matches the single
    parsed ``TableRowCell`` ("1.2", a non-header cell) and the single
    evidence ``TableCell``.
    """
    return Table(
        id=make_id(path),
        provenance=make_provenance(path),
        caption=make_caption(),
        # Kept on one line: a plain string literal cannot span lines.
        raw_html="<table><tr><td>1.2</td></tr></table>",
        rows=[TableRow(cells=[TableRowCell(text="1.2", is_header=False)])],
        cells=[
            TableCell(
                id=make_id(path + "/cell/0"),
                text="1.2",
                bbox=BoundingBox(x0=0, y0=0, x1=10, y1=10),
                polygon=Polygon(
                    points=((0, 0), (10, 0), (10, 10), (0, 10))
                ),
            )
        ],
    )


def make_figure(path: str = "/page/7/Figure/0") -> Figure:
    """Build a minimal valid Figure with a caption nested under *path*."""
    return Figure(
        id=make_id(path),
        provenance=make_provenance(path),
        caption=make_caption(path + "/Caption/0"),
    )


def make_page_header(path: str = "/page/0/PageHeader/0") -> PageHeader:
    """Build a minimal valid PageHeader with fixed raw text."""
    return PageHeader(
        id=make_id(path),
        provenance=make_provenance(path),
        raw_text="J. Agronomic Studies",
    )


def make_page_footer(path: str = "/page/0/PageFooter/0") -> PageFooter:
    """Build a minimal valid PageFooter with fixed raw text."""
    return PageFooter(
        id=make_id(path),
        provenance=make_provenance(path),
        raw_text="Page 1 of 12",
    )


def make_section(
    path: str = "/page/0/SectionHeader/0",
    *,
    heading_text: str = "Methods",
    depth: int = 0,
    children: list | None = None,
) -> Section:
    """Build a minimal valid Section; *children* defaults to an empty list."""
    return Section(
        id=make_id(path),
        heading_text=heading_text,
        provenance=make_provenance(path),
        depth=depth,
        children=children or [],
    )


def make_page(
    page_number: int = 0,
    *,
    children: list | None = None,
    is_front_matter: bool = False,
) -> Page:
    """Build a minimal valid Page whose id derives from ``/page/<page_number>``."""
    path = f"/page/{page_number}"
    return Page(
        id=make_id(path),
        page_number=page_number,
        provenance=make_provenance(path, page_number=page_number),
        children=children or [],
        is_front_matter=is_front_matter,
    )


def make_statistics(**overrides) -> Statistics:
    """Build a valid Statistics; keyword *overrides* replace the base counts."""
    base = dict(
        page_count=1,
        section_count=1,
        paragraph_count=1,
        table_count=0,
        figure_count=0,
        equation_count=0,
        footnote_count=0,
        reference_count=0,
        unresolved_footnote_count=0,
    )
    base.update(overrides)
    return Statistics(**base)


def make_metadata(**overrides) -> Metadata:
    """Build a valid Metadata; keyword *overrides* replace the base values."""
    base = dict(title="A Paper", page_count=1, has_front_matter_page=False)
    base.update(overrides)
    return Metadata(**base)
class TestDocument:
    """Construction and validation rules for the root Document container."""

    def test_minimal_valid_construction(self):
        document = make_document()
        assert len(document.pages) == 1

    def test_pages_must_be_non_empty(self):
        with pytest.raises(ValidationError):
            make_document(pages=[])

    def test_duplicate_page_numbers_rejected(self):
        with pytest.raises(ValidationError):
            first = make_page(0, children=[make_paragraph()])
            second = make_page(0, children=[make_paragraph("/page/0/Text/1")])
            make_document(pages=[first, second])

    def test_out_of_order_pages_rejected(self):
        with pytest.raises(ValidationError):
            later = make_page(1, children=[make_paragraph("/page/1/Text/0")])
            earlier = make_page(0, children=[make_paragraph("/page/0/Text/0")])
            make_document(pages=[later, earlier])

    def test_ascending_pages_with_gap_is_valid(self):
        # A page Marker omitted stays omitted: page numbers are never
        # silently re-numbered, so a gap is fine provided the list is
        # still in ascending order.
        gapped_pages = [
            make_page(0, children=[make_paragraph("/page/0/Text/0")]),
            make_page(2, children=[make_paragraph("/page/2/Text/0")]),
        ]
        document = make_document(pages=gapped_pages)
        assert [page.page_number for page in document.pages] == [0, 2]

    def test_multiple_ascending_pages_valid(self):
        consecutive_pages = [
            make_page(0, children=[make_paragraph("/page/0/Text/0")]),
            make_page(1, children=[make_paragraph("/page/1/Text/0")]),
            make_page(2, children=[make_paragraph("/page/2/Text/0")]),
        ]
        document = make_document(pages=consecutive_pages)
        assert len(document.pages) == 3

    def test_document_has_no_structural_provenance_field(self):
        # Spec 4 invariant: Document alone carries no StructuralProvenance
        # of its own, neither by field name nor by field type.
        declared = Document.model_fields
        assert "provenance" not in declared
        for info in declared.values():
            assert info.annotation is not StructuralProvenance

    def test_rejects_malformed_document_id(self):
        with pytest.raises(ValidationError):
            make_document(id="not-a-document-id")

    def test_rejects_object_id_shape_as_document_id(self):
        # An object-shaped ("doc:") id must not pass as a Document id.
        from betydb_extraction.document.identifiers import compute_object_id

        object_shaped_id = compute_object_id("betydoc:" + "a" * 16, "/page/0")
        with pytest.raises(ValidationError):
            make_document(id=object_shaped_id)

    def test_is_frozen(self):
        document = make_document()
        with pytest.raises(ValidationError):
            document.source_pdf_identifier = "different"
+""" +from __future__ import annotations + +import pytest + +from betydb_extraction.document.identifiers import ( + compute_document_id, + compute_object_id, + is_valid_document_id, + is_valid_object_id, + validate_document_id_shape, + validate_object_id_shape, +) + + +class TestDeterminism: + def test_document_id_is_deterministic(self): + assert compute_document_id("10.1000/x") == compute_document_id("10.1000/x") + + def test_document_id_differs_for_different_input(self): + assert compute_document_id("10.1000/x") != compute_document_id("10.1000/y") + + def test_object_id_is_deterministic(self): + doc_id = compute_document_id("10.1000/x") + assert compute_object_id(doc_id, "/page/0/Text/0") == compute_object_id( + doc_id, "/page/0/Text/0" + ) + + def test_object_id_differs_by_canonical_path(self): + doc_id = compute_document_id("10.1000/x") + assert compute_object_id(doc_id, "/page/0/Text/0") != compute_object_id( + doc_id, "/page/0/Text/1" + ) + + def test_object_id_differs_by_document_id(self): + doc_a = compute_document_id("10.1000/x") + doc_b = compute_document_id("10.1000/y") + assert compute_object_id(doc_a, "/page/0/Text/0") != compute_object_id( + doc_b, "/page/0/Text/0" + ) + + +class TestShape: + def test_document_id_has_expected_prefix_and_length(self): + value = compute_document_id("10.1000/x") + assert value.startswith("betydoc:") + assert len(value) == len("betydoc:") + 16 + + def test_object_id_has_expected_prefix_and_length(self): + value = compute_object_id(compute_document_id("10.1000/x"), "/page/0") + assert value.startswith("doc:") + assert len(value) == len("doc:") + 16 + + def test_is_valid_document_id_accepts_well_formed(self): + assert is_valid_document_id(compute_document_id("10.1000/x")) + + def test_is_valid_document_id_rejects_object_id(self): + doc_id = compute_document_id("10.1000/x") + obj_id = compute_object_id(doc_id, "/page/0") + assert not is_valid_document_id(obj_id) + + def test_is_valid_object_id_rejects_document_id(self): + 
doc_id = compute_document_id("10.1000/x") + assert not is_valid_object_id(doc_id) + + @pytest.mark.parametrize( + "bad_value", + [ + "betydoc:short", + "betydoc:" + "g" * 16, # non-hex char + "wrongprefix:" + "a" * 16, + "", + ], + ) + def test_is_valid_document_id_rejects_malformed(self, bad_value): + assert not is_valid_document_id(bad_value) + + @pytest.mark.parametrize( + "bad_value", + [ + "doc:short", + "doc:" + "G" * 16, # uppercase not allowed + "wrongprefix:" + "a" * 16, + "", + ], + ) + def test_is_valid_object_id_rejects_malformed(self, bad_value): + assert not is_valid_object_id(bad_value) + + +class TestRaisingValidators: + def test_validate_document_id_shape_returns_value_when_valid(self): + value = compute_document_id("10.1000/x") + assert validate_document_id_shape(value) == value + + def test_validate_document_id_shape_raises_when_invalid(self): + with pytest.raises(ValueError): + validate_document_id_shape("not-a-valid-id") + + def test_validate_object_id_shape_returns_value_when_valid(self): + doc_id = compute_document_id("10.1000/x") + value = compute_object_id(doc_id, "/page/0") + assert validate_object_id_shape(value) == value + + def test_validate_object_id_shape_raises_when_invalid(self): + with pytest.raises(ValueError): + validate_object_id_shape("not-a-valid-id") \ No newline at end of file diff --git a/tests/document/test_invariants.py b/tests/document/test_invariants.py new file mode 100644 index 0000000..6c26891 --- /dev/null +++ b/tests/document/test_invariants.py @@ -0,0 +1,110 @@ +"""Cross-cutting architectural invariant tests (Spec Section 1), exercised +across the whole tree rather than per-model: determinism (1.3), +immutability (1.1), and extra="forbid" (Section 19) enforcement. 
+""" +from __future__ import annotations + +import pytest +from pydantic import ValidationError + +from betydb_extraction.document import Section + +from .conftest import ( + make_document, + make_page, + make_paragraph, + make_reference, + make_section, + make_table, +) + + +class TestEndToEndDeterminism: + def test_identical_construction_produces_identical_ids(self): + doc_a = make_document() + doc_b = make_document() + assert doc_a.id == doc_b.id + assert doc_a.pages[0].id == doc_b.pages[0].id + + def test_identical_construction_produces_byte_identical_json(self): + doc_a = make_document() + doc_b = make_document() + assert doc_a.model_dump_json() == doc_b.model_dump_json() + + def test_full_tree_with_reference_is_deterministic(self): + def build(): + return make_document( + pages=[ + make_page( + 0, + children=[ + make_section( + "/page/0/SectionHeader/0", + heading_text="References", + children=[make_reference(), make_reference("/page/0/ListItem/1")], + ) + ], + ) + ] + ) + + doc_a, doc_b = build(), build() + assert doc_a.model_dump_json() == doc_b.model_dump_json() + + +class TestImmutabilityDepth: + """Frozen at the top level is necessary but not sufficient; nested + mutable containers (lists) must not allow item-level mutation either, + since Pydantic's frozen=True only freezes attribute assignment, not + list contents. + """ + + def test_top_level_assignment_blocked(self): + section = make_section() + with pytest.raises(ValidationError): + section.depth = 5 + + def test_section_children_list_itself_is_not_swappable(self): + section = make_section(children=[make_paragraph()]) + with pytest.raises(ValidationError): + section.children = [] + + def test_nested_child_object_is_independently_frozen(self): + para = make_paragraph() + section = make_section(children=[para]) + # The child retrieved from the parent is the same frozen model; + # mutating it must still fail. 
+ with pytest.raises(ValidationError): + section.children[0].text = "mutated" + + def test_document_pages_list_itself_is_not_swappable(self): + doc = make_document() + with pytest.raises(ValidationError): + doc.pages = [] + + +class TestExtraFieldsForbidden: + """Spec Section 19: an unexpected extra field indicates a Normalizer + bug and must fail loudly, unlike the Raw Marker Model's extra='allow'. + """ + + def test_section_rejects_unknown_field(self): + with pytest.raises(ValidationError): + Section( + **{ + **make_section().model_dump(), + "unexpected_field": "should not be allowed", + } + ) + + def test_table_rejects_unknown_field(self): + with pytest.raises(ValidationError): + type(make_table())( + **{**make_table().model_dump(), "unexpected_field": "x"} + ) + + def test_document_rejects_unknown_field(self): + with pytest.raises(ValidationError): + type(make_document())( + **{**make_document().model_dump(), "unexpected_field": "x"} + ) \ No newline at end of file diff --git a/tests/document/test_leaf_models.py b/tests/document/test_leaf_models.py new file mode 100644 index 0000000..d33130f --- /dev/null +++ b/tests/document/test_leaf_models.py @@ -0,0 +1,219 @@ +"""Tests for the leaf and supporting content models: Paragraph, Caption, +Table (+ TableRow/TableRowCell/TableCell), Figure, Equation, Footnote, +Reference, PageHeader, PageFooter. 
+""" +from __future__ import annotations + +import pytest +from pydantic import ValidationError + +from betydb_extraction.document import ( + Caption, + Equation, + Figure, + Footnote, + NodeKind, + PageFooter, + PageHeader, + Paragraph, + Reference, + Table, + TableRow, + TableRowCell, +) + +from .conftest import ( + make_caption, + make_equation, + make_footnote, + make_id, + make_page_footer, + make_page_header, + make_paragraph, + make_provenance, + make_reference, + make_table, +) + + +class TestParagraph: + def test_constructs_with_kind_default(self): + p = make_paragraph() + assert p.kind == NodeKind.PARAGRAPH + + def test_preserves_inline_html_verbatim(self): + p = make_paragraph(text="See Table 3 for details.") + assert p.text == "See Table 3 for details." + + def test_rejects_malformed_id(self): + with pytest.raises(ValidationError): + Paragraph(id="not-an-id", text="x", provenance=make_provenance()) + + def test_is_frozen(self): + p = make_paragraph() + with pytest.raises(ValidationError): + p.text = "different" + + +class TestCaption: + def test_all_text_fields_optional(self): + cap = Caption(provenance=make_provenance()) + assert cap.label is None + assert cap.text is None + assert cap.trailing_notes is None + + def test_pattern_a_shape(self): + cap = make_caption() + assert cap.label == "Table 3" + assert cap.text is not None + + def test_has_no_kind_discriminator(self): + # Caption is embedded only, never a children-union member. + assert not hasattr(Caption, "model_fields") or "kind" not in Caption.model_fields + + +class TestTable: + def test_constructs_with_kind_default(self): + t = make_table() + assert t.kind == NodeKind.TABLE + + def test_row_with_zero_cells_rejected(self): + with pytest.raises(ValidationError): + TableRow(cells=[]) + + def test_rows_may_be_empty_list(self): + t = Table( + id=make_id("/page/6/Table/1"), + provenance=make_provenance("/page/6/Table/1"), + raw_html="
", + ) + assert t.rows == [] + + def test_caption_optional(self): + t = Table( + id=make_id("/page/6/Table/2"), + provenance=make_provenance("/page/6/Table/2"), + raw_html="
", + ) + assert t.caption is None + + def test_table_row_cell_math_stripped_text_is_plain_field(self): + cell = TableRowCell(text="1.2 +/- 0.3", is_header=False) + assert cell.text == "1.2 +/- 0.3" + + def test_table_row_cell_has_no_row_col_index_fields(self): + assert "row_index" not in TableRowCell.model_fields + assert "col_index" not in TableRowCell.model_fields + + def test_table_row_cell_has_no_span_fields(self): + assert "rowspan" not in TableRowCell.model_fields + assert "colspan" not in TableRowCell.model_fields + + +class TestFigure: + def test_constructs_with_kind_default(self): + from .conftest import make_figure + + fig = make_figure() + assert fig.kind == NodeKind.FIGURE + + def test_image_data_optional(self): + fig = Figure( + id=make_id("/page/7/Figure/1"), + provenance=make_provenance("/page/7/Figure/1"), + ) + assert fig.image_data is None + + +class TestEquation: + def test_constructs_with_kind_default(self): + eq = make_equation() + assert eq.kind == NodeKind.EQUATION + + def test_equation_number_is_separate_optional_slot(self): + eq = Equation( + id=make_id("/page/3/Equation/1"), + provenance=make_provenance("/page/3/Equation/1"), + raw_math="E = mc^2", + equation_number="2", + ) + assert eq.equation_number == "2" + assert "(2)" not in eq.raw_math # number is a separate field here + + def test_raw_math_can_embed_number_inline(self): + # Per spec: Marker provides no separate field, so raw_math may + # contain the number embedded in the string itself. + eq = Equation( + id=make_id("/page/3/Equation/2"), + provenance=make_provenance("/page/3/Equation/2"), + raw_math="y = mx + b ... 
(1)", + ) + assert "(1)" in eq.raw_math + assert eq.equation_number is None + + +class TestFootnote: + def test_constructs_with_kind_default(self): + fn = make_footnote() + assert fn.kind == NodeKind.FOOTNOTE + + def test_attached_object_id_defaults_to_none(self): + fn = make_footnote() + assert fn.attached_object_id is None + + def test_unresolved_attachment_does_not_fail_validation(self): + # Spec 20: no validation forces attached_object_id to be set. + fn = Footnote( + id=make_id("/page/0/Footnote/1"), + provenance=make_provenance("/page/0/Footnote/1"), + raw_text="2. Another note.", + attached_object_id=None, + ) + assert fn.attached_object_id is None + + def test_attached_object_id_can_be_set(self): + fn = Footnote( + id=make_id("/page/0/Footnote/2"), + provenance=make_provenance("/page/0/Footnote/2"), + raw_text="3. Yet another.", + attached_object_id=make_id("/page/0/Table/0"), + ) + assert fn.attached_object_id is not None + + +class TestReference: + """Reference per Spec Section 16, Version 1.1 (kind discriminator added).""" + + def test_constructs_with_kind_default(self): + ref = make_reference() + assert ref.kind == NodeKind.REFERENCE + + def test_kind_is_required_in_field_set(self): + assert "kind" in Reference.model_fields + + def test_raw_text_preserved_verbatim(self): + ref = make_reference() + assert "Smukler" in ref.raw_text + + def test_is_frozen(self): + ref = make_reference() + with pytest.raises(ValidationError): + ref.raw_text = "different" + + def test_rejects_malformed_id(self): + with pytest.raises(ValidationError): + Reference(id="bad-id", provenance=make_provenance(), raw_text="x") + + +class TestPageHeaderFooter: + def test_page_header_constructs_with_kind_default(self): + h = make_page_header() + assert h.kind == NodeKind.PAGE_HEADER + + def test_page_footer_constructs_with_kind_default(self): + f = make_page_footer() + assert f.kind == NodeKind.PAGE_FOOTER + + def test_distinct_types_not_merged(self): + assert PageHeader is not 
PageFooter + assert PageHeader.model_fields["kind"].default != PageFooter.model_fields["kind"].default \ No newline at end of file diff --git a/tests/document/test_metadata_and_statistics.py b/tests/document/test_metadata_and_statistics.py new file mode 100644 index 0000000..fb9af6a --- /dev/null +++ b/tests/document/test_metadata_and_statistics.py @@ -0,0 +1,117 @@ +"""Tests for Statistics (Spec Section 7), Metadata (Spec Section 5), and +ProcessingMetadata (Spec Section 6). +""" +from __future__ import annotations + +from datetime import datetime, timedelta, timezone + +import pytest +from pydantic import ValidationError + +from betydb_extraction.document import Metadata, ProcessingMetadata, Statistics + +from .conftest import make_metadata, make_processing_metadata, make_statistics + + +class TestStatistics: + def test_minimal_valid_construction(self): + stats = make_statistics() + assert stats.page_count == 1 + + def test_all_counts_must_be_non_negative(self): + for field in [ + "page_count", + "section_count", + "paragraph_count", + "table_count", + "figure_count", + "equation_count", + "footnote_count", + "reference_count", + "unresolved_footnote_count", + ]: + with pytest.raises(ValidationError): + make_statistics(**{field: -1}) + + def test_unresolved_cannot_exceed_total_footnotes(self): + with pytest.raises(ValidationError): + make_statistics(footnote_count=2, unresolved_footnote_count=3) + + def test_unresolved_equal_to_total_is_valid(self): + stats = make_statistics(footnote_count=2, unresolved_footnote_count=2) + assert stats.unresolved_footnote_count == 2 + + def test_unresolved_less_than_total_is_valid(self): + stats = make_statistics(footnote_count=5, unresolved_footnote_count=2) + assert stats.unresolved_footnote_count == 2 + + def test_reference_count_field_exists(self): + # Direct regression guard for the original v1.0 omission this + # whole correction was about: Statistics always had this field, + # and it must still be present and independently 
settable. + stats = make_statistics(reference_count=7) + assert stats.reference_count == 7 + + def test_is_frozen(self): + stats = make_statistics() + with pytest.raises(ValidationError): + stats.page_count = 99 + + +class TestMetadata: + def test_minimal_valid_construction(self): + meta = make_metadata() + assert meta.page_count == 1 + + def test_title_optional(self): + meta = Metadata(page_count=1, has_front_matter_page=False) + assert meta.title is None + + def test_page_count_non_negative(self): + with pytest.raises(ValidationError): + make_metadata(page_count=-1) + + def test_has_no_author_journal_year_doi_fields(self): + # Spec 5's boundary note: these are explicitly NOT modeled here. + for forbidden_field in ["author", "authors", "journal", "year", "doi"]: + assert forbidden_field not in Metadata.model_fields + + def test_is_frozen(self): + meta = make_metadata() + with pytest.raises(ValidationError): + meta.title = "Different Title" + + +class TestProcessingMetadata: + def test_minimal_valid_construction(self): + pm = make_processing_metadata() + assert pm.marker_version == "1.2.3" + + def test_naive_datetime_rejected(self): + with pytest.raises(ValidationError): + make_processing_metadata(processed_at=datetime(2026, 6, 17, 12, 0, 0)) + + def test_non_utc_offset_rejected(self): + offset_tz = timezone(timedelta(hours=5)) + with pytest.raises(ValidationError): + make_processing_metadata( + processed_at=datetime(2026, 6, 17, 12, 0, 0, tzinfo=offset_tz) + ) + + def test_utc_datetime_accepted(self): + pm = make_processing_metadata( + processed_at=datetime(2026, 6, 17, 12, 0, 0, tzinfo=timezone.utc) + ) + assert pm.processed_at.tzinfo is not None + + def test_processed_at_not_part_of_any_id(self): + # Spec 6: processed_at is explicitly excluded from id computation. + # This is really a Document/identifiers-level guarantee, but we + # confirm here that ProcessingMetadata itself exposes no id field + # derived from processed_at. 
+ assert "id" not in ProcessingMetadata.model_fields + + def test_is_frozen(self): + pm = make_processing_metadata() + with pytest.raises(ValidationError): + pm.marker_version = "9.9.9" \ No newline at end of file diff --git a/tests/document/test_provenance.py b/tests/document/test_provenance.py new file mode 100644 index 0000000..9dde752 --- /dev/null +++ b/tests/document/test_provenance.py @@ -0,0 +1,150 @@ +"""Tests for betydb_extraction.document.provenance. + +Verifies Spec Section 3 (Foundational Supporting Types) and the relevant +Section 20 validation rules. +""" +from __future__ import annotations + +import pytest +from pydantic import ValidationError + +from betydb_extraction.document.provenance import ( + BoundingBox, + Polygon, + StructuralProvenance, +) + + +class TestBoundingBox: + def test_valid_box_constructs(self): + box = BoundingBox(x0=0, y0=0, x1=10, y1=20) + assert (box.x0, box.y0, box.x1, box.y1) == (0, 0, 10, 20) + + def test_zero_area_box_is_allowed(self): + # Spec 20 only requires x1 >= x0 and y1 >= y0, not strict >. 
+ box = BoundingBox(x0=5, y0=5, x1=5, y1=5) + assert box.x1 == box.x0 + + def test_inverted_x_axis_rejected(self): + with pytest.raises(ValidationError): + BoundingBox(x0=10, y0=0, x1=0, y1=10) + + def test_inverted_y_axis_rejected(self): + with pytest.raises(ValidationError): + BoundingBox(x0=0, y0=10, x1=10, y1=0) + + def test_is_frozen(self): + box = BoundingBox(x0=0, y0=0, x1=1, y1=1) + with pytest.raises(ValidationError): + box.x0 = 5 + + def test_extra_fields_forbidden(self): + with pytest.raises(ValidationError): + BoundingBox(x0=0, y0=0, x1=1, y1=1, z0=0) + + +class TestPolygon: + def test_valid_polygon_constructs(self): + poly = Polygon(points=((0, 0), (10, 0), (10, 10), (0, 10))) + assert len(poly.points) == 4 + + def test_wrong_point_count_rejected(self): + with pytest.raises(ValidationError): + Polygon(points=((0, 0), (10, 0), (10, 10))) + + def test_is_frozen(self): + poly = Polygon(points=((0, 0), (10, 0), (10, 10), (0, 10))) + with pytest.raises(ValidationError): + poly.points = ((0, 0), (1, 0), (1, 1), (0, 1)) + + +class TestStructuralProvenance: + def test_minimal_valid_construction(self): + prov = StructuralProvenance( + marker_block_ids=["/page/0/Text/0"], + page_number=0, + reading_order_index=0, + ) + assert prov.bbox is None + assert prov.contributing_bboxes is None + assert prov.section_path == [] + + def test_empty_marker_block_ids_rejected(self): + with pytest.raises(ValidationError): + StructuralProvenance( + marker_block_ids=[], + page_number=0, + reading_order_index=0, + ) + + def test_negative_reading_order_index_rejected(self): + with pytest.raises(ValidationError): + StructuralProvenance( + marker_block_ids=["/page/0/Text/0"], + page_number=0, + reading_order_index=-1, + ) + + def test_bbox_and_contributing_bboxes_mutually_exclusive(self): + box = BoundingBox(x0=0, y0=0, x1=1, y1=1) + with pytest.raises(ValidationError): + StructuralProvenance( + marker_block_ids=["/page/0/Text/0"], + page_number=0, + reading_order_index=0, + 
bbox=box, + contributing_bboxes=[box], + ) + + def test_bbox_alone_is_valid(self): + box = BoundingBox(x0=0, y0=0, x1=1, y1=1) + prov = StructuralProvenance( + marker_block_ids=["/page/0/Text/0"], + page_number=0, + reading_order_index=0, + bbox=box, + ) + assert prov.bbox == box + + def test_contributing_bboxes_alone_is_valid(self): + box = BoundingBox(x0=0, y0=0, x1=1, y1=1) + prov = StructuralProvenance( + marker_block_ids=["/page/0/SectionHeader/0", "/page/0/Text/1"], + page_number=0, + reading_order_index=0, + contributing_bboxes=[box, box], + ) + assert len(prov.contributing_bboxes) == 2 + + def test_neither_bbox_field_is_valid(self): + # No recoverable geometry is a legitimate outcome (Section 3.3). + prov = StructuralProvenance( + marker_block_ids=["/page/0/Text/0"], + page_number=0, + reading_order_index=0, + ) + assert prov.bbox is None and prov.contributing_bboxes is None + + def test_section_path_preserves_order(self): + prov = StructuralProvenance( + marker_block_ids=["/page/7/TableCell/3"], + page_number=7, + reading_order_index=42, + section_path=[ + "/page/1/SectionHeader/1", + "/page/7/SectionHeader/0", + ], + ) + assert prov.section_path == [ + "/page/1/SectionHeader/1", + "/page/7/SectionHeader/0", + ] + + def test_is_frozen(self): + prov = StructuralProvenance( + marker_block_ids=["/page/0/Text/0"], + page_number=0, + reading_order_index=0, + ) + with pytest.raises(ValidationError): + prov.page_number = 1 \ No newline at end of file diff --git a/tests/document/test_section_and_page.py b/tests/document/test_section_and_page.py new file mode 100644 index 0000000..acb22a2 --- /dev/null +++ b/tests/document/test_section_and_page.py @@ -0,0 +1,195 @@ +"""Tests for Section (Spec Section 9) and Page (Spec Section 8), with +specific focus on the Version 1.1 correction: Reference is now a valid +Section.children member, and remains excluded from Page.children. 
+""" +from __future__ import annotations + +import pytest +from pydantic import TypeAdapter, ValidationError + +from betydb_extraction.document import ( + NodeKind, + Page, + PageChild, + Paragraph, + Reference, + Section, + SectionChild, + Table, +) + +from .conftest import ( + make_equation, + make_figure, + make_footnote, + make_page, + make_page_footer, + make_page_header, + make_paragraph, + make_provenance, + make_reference, + make_section, + make_table, +) + + +class TestSectionChildUnionIncludesReference: + """The Version 1.1 correction, verified directly.""" + + def test_reference_is_a_valid_section_child_by_direct_construction(self): + ref = make_reference() + section = make_section( + heading_text="References", + children=[ref], + ) + assert len(section.children) == 1 + assert isinstance(section.children[0], Reference) + assert section.children[0].kind == NodeKind.REFERENCE + + def test_section_with_mixed_children_including_reference(self): + section = make_section( + heading_text="References", + children=[make_paragraph(), make_reference(), make_reference("/page/9/ListItem/1")], + ) + kinds = [child.kind for child in section.children] + assert kinds == [NodeKind.PARAGRAPH, NodeKind.REFERENCE, NodeKind.REFERENCE] + + def test_discriminated_union_resolves_reference_from_dict(self): + # Simulates what happens when a Section is built from raw dict/JSON + # data (e.g. round-tripped) -- the discriminator must correctly + # route a 'reference' kind to the Reference model rather than + # raising or silently coercing to a different type. 
+ ref = make_reference() + section = Section( + id=make_section().id, + heading_text="References", + provenance=make_provenance("/page/9/SectionHeader/0"), + depth=0, + children=[ref.model_dump()], + ) + assert isinstance(section.children[0], Reference) + + def test_section_child_type_adapter_accepts_reference(self): + adapter = TypeAdapter(SectionChild) + ref = make_reference() + resolved = adapter.validate_python(ref.model_dump()) + assert isinstance(resolved, Reference) + + def test_nested_sections_still_work_alongside_reference(self): + inner = make_section( + "/page/9/SectionHeader/1", + heading_text="References", + depth=1, + children=[make_reference()], + ) + outer = make_section( + "/page/9/SectionHeader/0", + heading_text="Back Matter", + depth=0, + children=[inner], + ) + assert isinstance(outer.children[0], Section) + assert isinstance(outer.children[0].children[0], Reference) + + def test_all_pre_v1_1_child_types_still_valid(self): + # Guards against the v1.1 change accidentally narrowing the union + # instead of only widening it. 
+ section = make_section( + children=[ + make_paragraph(), + make_table(), + make_figure(), + make_equation(), + make_footnote(), + ] + ) + kinds = {child.kind for child in section.children} + assert kinds == { + NodeKind.PARAGRAPH, + NodeKind.TABLE, + NodeKind.FIGURE, + NodeKind.EQUATION, + NodeKind.FOOTNOTE, + } + + +class TestPageChildUnionExcludesReference: + """Per the Version 1.1 note added to Spec Section 8: Reference is + Section-only, not Page-only.""" + + def test_reference_is_not_in_page_child_union_members(self): + adapter = TypeAdapter(PageChild) + ref = make_reference() + with pytest.raises(ValidationError): + adapter.validate_python(ref.model_dump()) + + def test_page_rejects_reference_as_direct_child(self): + ref = make_reference() + with pytest.raises(ValidationError): + make_page(children=[ref]) + + def test_page_accepts_section_header_footer_and_all_pre_v1_1_types(self): + page = make_page( + children=[ + make_page_header(), + make_section(children=[make_paragraph()]), + make_table(), + make_figure(), + make_equation(), + make_footnote(), + make_page_footer(), + ] + ) + assert len(page.children) == 7 + + +class TestSection: + def test_constructs_with_kind_default(self): + section = make_section() + assert section.kind == NodeKind.SECTION + + def test_children_default_to_empty_list(self): + section = make_section() + assert section.children == [] + + def test_depth_must_be_non_negative(self): + with pytest.raises(ValidationError): + make_section(depth=-1) + + def test_is_frozen(self): + section = make_section() + with pytest.raises(ValidationError): + section.heading_text = "Different" + + def test_rejects_malformed_id(self): + with pytest.raises(ValidationError): + Section( + id="not-an-id", + heading_text="Methods", + provenance=make_provenance(), + depth=0, + ) + + +class TestPage: + def test_page_number_must_be_non_negative(self): + with pytest.raises(ValidationError): + make_page(page_number=-1) + + def 
test_children_default_to_empty_list(self):
+        page = make_page()
+        assert page.children == []
+
+    def test_is_front_matter_required(self):
+        with pytest.raises(ValidationError):
+            Page(
+                id=make_page().id,
+                page_number=0,
+                provenance=make_provenance("/page/0"),
+                children=[],
+            )
+
+    def test_is_frozen(self):
+        page = make_page()
+        with pytest.raises(ValidationError):
+            page.is_front_matter = True
\ No newline at end of file
diff --git a/tests/document/test_serialization.py b/tests/document/test_serialization.py
new file mode 100644
index 0000000..d1ed55e
--- /dev/null
+++ b/tests/document/test_serialization.py
@@ -0,0 +1,168 @@
+"""Serialization tests per Spec invariant 1.5 and Section 19: every model
+must round-trip losslessly through model_dump()/model_validate() and
+model_dump_json()/model_validate_json().
+"""
+from __future__ import annotations
+
+import base64
+
+from betydb_extraction.document import Figure
+
+from .conftest import (
+    make_document,
+    make_equation,
+    make_footnote,
+    make_metadata,
+    make_page,
+    make_paragraph,
+    make_processing_metadata,
+    make_provenance,
+    make_reference,
+    make_section,
+    make_statistics,
+    make_table,
+)
+
+
+def _round_trip_dict(model):
+    cls = type(model)
+    dumped = model.model_dump()
+    rebuilt = cls.model_validate(dumped)
+    assert rebuilt == model
+    return dumped
+
+
+def _round_trip_json(model):
+    cls = type(model)
+    dumped_json = model.model_dump_json()
+    rebuilt = cls.model_validate_json(dumped_json)
+    assert rebuilt == model
+    return dumped_json
+
+
+class TestRoundTripDictAndJson:
+    def test_paragraph_round_trips(self):
+        p = make_paragraph()
+        _round_trip_dict(p)
+        _round_trip_json(p)
+
+    def test_reference_round_trips(self):
+        ref = make_reference()
+        _round_trip_dict(ref)
+        _round_trip_json(ref)
+
+    def test_table_round_trips(self):
+        t = make_table()
+        _round_trip_dict(t)
+        _round_trip_json(t)
+
+    def test_equation_round_trips(self):
+        eq = make_equation()
+        _round_trip_dict(eq)
+        _round_trip_json(eq)
+
+    def test_footnote_round_trips(self):
+        fn = make_footnote()
+        _round_trip_dict(fn)
+        _round_trip_json(fn)
+
+    def test_section_with_reference_child_round_trips(self):
+        # Specifically exercises the v1.1-corrected union through a full
+        # JSON round trip, not just direct Python construction.
+        section = make_section(
+            heading_text="References",
+            children=[make_reference(), make_reference("/page/9/ListItem/1")],
+        )
+        dumped = _round_trip_dict(section)
+        assert dumped["children"][0]["kind"] == "reference"
+        _round_trip_json(section)
+
+    def test_page_round_trips(self):
+        page = make_page(children=[make_paragraph(), make_table()])
+        _round_trip_dict(page)
+        _round_trip_json(page)
+
+    def test_statistics_round_trips(self):
+        stats = make_statistics()
+        _round_trip_dict(stats)
+        _round_trip_json(stats)
+
+    def test_metadata_round_trips(self):
+        meta = make_metadata()
+        _round_trip_dict(meta)
+        _round_trip_json(meta)
+
+    def test_processing_metadata_round_trips_with_utc_datetime(self):
+        pm = make_processing_metadata()
+        dumped_json = _round_trip_json(pm)
+        assert "2026-06-17" in dumped_json
+
+    def test_document_round_trips(self):
+        doc = make_document()
+        _round_trip_dict(doc)
+        _round_trip_json(doc)
+
+    def test_document_with_full_tree_round_trips(self):
+        doc = make_document(
+            pages=[
+                make_page(
+                    0,
+                    children=[
+                        make_section(
+                            "/page/0/SectionHeader/0",
+                            heading_text="Methods",
+                            children=[make_paragraph(), make_table()],
+                        ),
+                        make_section(
+                            "/page/0/SectionHeader/1",
+                            heading_text="References",
+                            children=[make_reference()],
+                        ),
+                    ],
+                )
+            ]
+        )
+        _round_trip_dict(doc)
+        _round_trip_json(doc)
+
+
+class TestFieldOrderingDeterminism:
+    def test_dump_json_field_order_matches_declaration_order(self):
+        p = make_paragraph()
+        dumped = p.model_dump_json()
+        # Declared order in Paragraph: kind, id, text, provenance.
+        assert dumped.index('"kind"') < dumped.index('"id"') < dumped.index(
+            '"text"'
+        ) < dumped.index('"provenance"')
+
+    def test_repeated_dumps_are_byte_identical(self):
+        p = make_paragraph()
+        assert p.model_dump_json() == p.model_dump_json()
+
+
+class TestBytesFieldSerialization:
+    def test_figure_image_data_serializes_as_base64_string(self):
+        raw_bytes = b"\x89PNG\r\n\x1a\nfake-bytes"
+        fig = Figure(
+            id=make_paragraph().id,  # any well-formed id is fine here
+            provenance=make_provenance("/page/7/Figure/9"),
+            image_data=raw_bytes,
+        )
+        dumped_json = fig.model_dump_json()
+        # Pydantic v2 emits bytes as UTF-8 text unless the model opts in to
+        # base64, so decode the dumped value to prove the opt-in is active.
+        # "-"/"_" are mapped back so either base64 alphabet is accepted.
+        encoded = fig.model_dump(mode="json")["image_data"]
+        standard = encoded.replace("-", "+").replace("_", "/")
+        padded = standard + "=" * (-len(standard) % 4)
+        assert base64.b64decode(padded) == raw_bytes
+        rebuilt = Figure.model_validate_json(dumped_json)
+        assert rebuilt.image_data == raw_bytes
+
+    def test_figure_with_none_image_data_round_trips(self):
+        fig = Figure(
+            id=make_paragraph().id,
+            provenance=make_provenance("/page/7/Figure/10"),
+        )
+        rebuilt = Figure.model_validate_json(fig.model_dump_json())
+        assert rebuilt.image_data is None
\ No newline at end of file
diff --git a/tests/marker_adapter/__init__.py b/tests/marker_adapter/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/marker_adapter/test_raw_model.py b/tests/marker_adapter/test_raw_model.py
new file mode 100644
index 0000000..43c0458
--- /dev/null
+++ b/tests/marker_adapter/test_raw_model.py
@@ -0,0 +1,463 @@
+"""
+Unit tests for the Raw Marker Model (`betydb_extraction.marker_adapter.raw_model`).
+
+Test strategy
+-------------
+Three tiers of coverage, in order of priority:
+
+1. Construction/validation against synthetic minimal inputs -- fast,
+   isolates one behavior per test, doesn't depend on any fixture file.
+2. Construction/validation against the REAL Marker output for the
+   representative paper (Smukler et al. 2012), loaded from the uploads
+   directory. This is the ground-truth regression test: if a future edit
+   to this model breaks parsing of real Marker output, this test catches
+   it immediately.
Skipped gracefully if the fixture file is not present
+   in this environment (e.g. CI without the uploaded fixture), rather than
+   failing the whole suite.
+3. Round-trip serialization losslessness, both against synthetic inputs
+   and against the real document, since "lossless mirror" is the central
+   architectural guarantee of this layer and deserves direct, explicit
+   verification rather than being assumed from (1) and (2) passing.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+import pytest
+
+from betydb_extraction.marker_adapter.raw_model import (
+    MarkerBBox,
+    MarkerBlock,
+    MarkerDocument,
+    MarkerPolygonPoint,
+)
+
+REAL_FIXTURE_PATH = Path(
+    "/mnt/user-data/uploads/1781681908897_Nutrient-cycling.json"
+)
+
+
+# ---------------------------------------------------------------------------
+# Fixtures / helpers
+# ---------------------------------------------------------------------------
+
+
+def _minimal_leaf_block(
+    block_id: str = "/page/0/Text/0",
+    block_type: str = "Text",
+    html: str = "<p>hello</p>",
+) -> dict:
+    """A minimal, valid leaf block dict matching Marker's observed envelope."""
+    return {
+        "id": block_id,
+        "block_type": block_type,
+        "html": html,
+        "polygon": [[0.0, 0.0], [100.0, 0.0], [100.0, 10.0], [0.0, 10.0]],
+        "bbox": [0.0, 0.0, 100.0, 10.0],
+        "children": None,
+        "section_hierarchy": {},
+        "images": {},
+    }
+
+
+def _minimal_container_block(
+    block_id: str,
+    block_type: str,
+    children: list[dict],
+) -> dict:
+    """A minimal, valid container block dict wrapping the given children."""
+    return {
+        "id": block_id,
+        "block_type": block_type,
+        "html": "".join(f"<content-ref src='{c['id']}'></content-ref>" for c in children),
+        "polygon": [[0.0, 0.0], [200.0, 0.0], [200.0, 200.0], [0.0, 200.0]],
+        "bbox": [0.0, 0.0, 200.0, 200.0],
+        "children": children,
+        "section_hierarchy": {},
+        "images": {},
+    }
+
+
+@pytest.fixture(scope="module")
+def real_marker_json() -> dict:
+    if not REAL_FIXTURE_PATH.exists():
+        pytest.skip(f"Real Marker fixture not present at {REAL_FIXTURE_PATH}")
+    with open(REAL_FIXTURE_PATH, encoding="utf-8") as f:
+        return json.load(f)
+
+
+# ---------------------------------------------------------------------------
+# 1.
Construction / validation -- synthetic inputs
+# ---------------------------------------------------------------------------
+
+
+class TestMarkerPolygonPoint:
+    def test_construct_from_pair(self):
+        point = MarkerPolygonPoint.from_pair([12.5, 34.0])
+        assert point.x == 12.5
+        assert point.y == 34.0
+
+    def test_to_pair_round_trip(self):
+        point = MarkerPolygonPoint(x=1.0, y=2.0)
+        assert point.to_pair() == [1.0, 2.0]
+
+    def test_is_frozen(self):
+        point = MarkerPolygonPoint(x=1.0, y=2.0)
+        with pytest.raises(Exception):
+            point.x = 5.0  # type: ignore[misc]
+
+    def test_rejects_extra_fields(self):
+        with pytest.raises(Exception):
+            MarkerPolygonPoint(x=1.0, y=2.0, z=3.0)  # type: ignore[call-arg]
+
+
+class TestMarkerBBox:
+    def test_construct_from_list(self):
+        bbox = MarkerBBox.from_list([0.0, 0.0, 10.0, 20.0])
+        assert (bbox.x0, bbox.y0, bbox.x1, bbox.y1) == (0.0, 0.0, 10.0, 20.0)
+
+    def test_to_list_round_trip(self):
+        bbox = MarkerBBox(x0=0.0, y0=1.0, x1=2.0, y1=3.0)
+        assert bbox.to_list() == [0.0, 1.0, 2.0, 3.0]
+
+    def test_is_frozen(self):
+        bbox = MarkerBBox(x0=0.0, y0=0.0, x1=1.0, y1=1.0)
+        with pytest.raises(Exception):
+            bbox.x0 = 99.0  # type: ignore[misc]
+
+
+class TestMarkerBlockConstruction:
+    def test_construct_minimal_leaf(self):
+        block = MarkerBlock.model_validate(_minimal_leaf_block())
+        assert block.block_type == "Text"
+        assert block.is_leaf()
+        assert block.children is None
+
+    def test_construct_container_with_children(self):
+        leaf = _minimal_leaf_block()
+        container = _minimal_container_block(
+            "/page/0/TableGroup/1", "TableGroup", [leaf]
+        )
+        block = MarkerBlock.model_validate(container)
+        assert not block.is_leaf()
+        assert len(block.children) == 1
+        assert block.children[0].block_type == "Text"
+
+    def test_polygon_coerced_from_raw_pairs(self):
+        block = MarkerBlock.model_validate(_minimal_leaf_block())
+        assert isinstance(block.polygon[0], MarkerPolygonPoint)
+        assert block.polygon[0].x == 0.0
+        assert block.polygon[0].y == 0.0
+
+    def test_bbox_coerced_from_raw_list(self):
+        block = MarkerBlock.model_validate(_minimal_leaf_block())
+        assert isinstance(block.bbox, MarkerBBox)
+        assert block.bbox.x1 == 100.0
+
+    def test_root_document_block_with_no_id_or_geometry(self):
+        """
+        The real Marker root block omits id, html, polygon, bbox, and
+        section_hierarchy entirely. This must parse successfully with
+        those fields defaulting to None/empty, not raise a validation
+        error -- this is a confirmed empirical fact (see module docstring
+        in raw_model.py), not a hypothetical edge case.
+        """
+        data = {
+            "children": [_minimal_leaf_block()],
+            "block_type": "Document",
+        }
+        block = MarkerBlock.model_validate(data)
+        assert block.id is None
+        assert block.bbox is None
+        assert block.polygon is None
+        assert block.section_hierarchy == {}
+        assert len(block.children) == 1
+
+    def test_unknown_block_type_parses_successfully(self):
+        """
+        A block_type the model has never seen before must still parse,
+        since block_type is a plain str, not a closed enum/discriminator.
+        This is the core guarantee that makes this layer insulated from
+        future Marker schema changes.
+        """
+        data = _minimal_leaf_block(block_type="SomeFutureBlockType")
+        block = MarkerBlock.model_validate(data)
+        assert block.block_type == "SomeFutureBlockType"
+
+    def test_unknown_extra_field_is_preserved_not_dropped(self):
+        data = _minimal_leaf_block()
+        data["confidence_score"] = 0.987  # hypothetical future Marker field
+        block = MarkerBlock.model_validate(data)
+        dumped = block.model_dump(mode="json")
+        assert dumped["confidence_score"] == 0.987
+
+    def test_missing_required_block_type_raises(self):
+        data = _minimal_leaf_block()
+        del data["block_type"]
+        with pytest.raises(Exception):
+            MarkerBlock.model_validate(data)
+
+    def test_is_frozen_immutable(self):
+        block = MarkerBlock.model_validate(_minimal_leaf_block())
+        with pytest.raises(Exception):
+            block.html = "<p>mutated</p>"  # type: ignore[misc]
+
+    def test_nested_children_are_also_frozen(self):
+        leaf = _minimal_leaf_block()
+        container = _minimal_container_block(
+            "/page/0/TableGroup/1", "TableGroup", [leaf]
+        )
+        block = MarkerBlock.model_validate(container)
+        with pytest.raises(Exception):
+            block.children[0].html = "<p>mutated</p>"  # type: ignore[misc]
+
+    def test_iter_descendants_depth_first_order(self):
+        grandchild = _minimal_leaf_block("/page/0/Text/2", "Text")
+        child_container = _minimal_container_block(
+            "/page/0/Caption/1", "Caption", [grandchild]
+        )
+        root_container = _minimal_container_block(
+            "/page/0/TableGroup/0", "TableGroup", [child_container]
+        )
+        block = MarkerBlock.model_validate(root_container)
+        descendants = block.iter_descendants()
+        assert [d.id for d in descendants] == [
+            "/page/0/Caption/1",
+            "/page/0/Text/2",
+        ]
+
+    def test_iter_descendants_empty_for_leaf(self):
+        block = MarkerBlock.model_validate(_minimal_leaf_block())
+        assert block.iter_descendants() == []
+
+    def test_images_dict_preserved(self):
+        data = _minimal_leaf_block(block_type="Picture")
+        data["images"] = {"/page/0/Picture/0": "base64fakepayload=="}
+        block = MarkerBlock.model_validate(data)
+        assert block.images == {"/page/0/Picture/0": "base64fakepayload=="}
+
+    def test_children_none_vs_empty_list_distinction_preserved(self):
+        """
+        Marker emits `children: None` for true leaves, never `children: []`.
+        This model must preserve that distinction exactly rather than
+        normalizing both to one representation.
+        """
+        leaf_data = _minimal_leaf_block()
+        leaf_data["children"] = None
+        leaf_block = MarkerBlock.model_validate(leaf_data)
+        assert leaf_block.children is None
+
+        empty_list_data = _minimal_leaf_block()
+        empty_list_data["children"] = []
+        empty_list_block = MarkerBlock.model_validate(empty_list_data)
+        assert empty_list_block.children == []
+        # These are deliberately NOT equal in meaning -- is_leaf() should
+        # only be True for the None case.
+        assert leaf_block.is_leaf() is True
+        assert empty_list_block.is_leaf() is False
+
+
+class TestMarkerDocument:
+    def test_wraps_root_block(self):
+        leaf = _minimal_leaf_block()
+        root_data = _minimal_container_block("doc-root", "Document", [leaf])
+        root_block = MarkerBlock.model_validate(root_data)
+        doc = MarkerDocument(root=root_block)
+        assert doc.root.block_type == "Document"
+
+    def test_pages_property_returns_root_children(self):
+        page_block = _minimal_container_block("/page/0/Page/0", "Page", [])
+        root_data = _minimal_container_block("doc-root", "Document", [page_block])
+        root_block = MarkerBlock.model_validate(root_data)
+        doc = MarkerDocument(root=root_block)
+        assert len(doc.pages) == 1
+        assert doc.pages[0].block_type == "Page"
+
+    def test_pages_property_empty_when_no_children(self):
+        root_block = MarkerBlock.model_validate(
+            {"children": None, "block_type": "Document"}
+        )
+        doc = MarkerDocument(root=root_block)
+        assert doc.pages == []
+
+    def test_source_path_optional_and_retained(self):
+        root_block = MarkerBlock.model_validate(
+            {"children": None, "block_type": "Document"}
+        )
+        doc = MarkerDocument(root=root_block, source_marker_json_path="/tmp/x.json")
+        assert doc.source_marker_json_path == "/tmp/x.json"
+
+    def test_is_frozen(self):
+        root_block = MarkerBlock.model_validate(
+            {"children": None, "block_type": "Document"}
+        )
+        doc = MarkerDocument(root=root_block)
+        with pytest.raises(Exception):
+            doc.source_marker_json_path = "/tmp/other.json"  # type: ignore[misc]
+
+
+# ---------------------------------------------------------------------------
+# 2.
Construction / validation -- real Marker output (ground truth)
+# ---------------------------------------------------------------------------
+
+
+class TestRealMarkerDocument:
+    def test_parses_without_error(self, real_marker_json):
+        block = MarkerBlock.model_validate(real_marker_json)
+        assert block.block_type == "Document"
+
+    def test_root_has_no_id(self, real_marker_json):
+        """Confirmed empirical fact: the real root block has no `id` field."""
+        block = MarkerBlock.model_validate(real_marker_json)
+        assert block.id is None
+
+    def test_page_count_matches_known_value(self, real_marker_json):
+        block = MarkerBlock.model_validate(real_marker_json)
+        doc = MarkerDocument(root=block)
+        assert len(doc.pages) == 17
+
+    def test_total_descendant_count_matches_known_census(self, real_marker_json):
+        """
+        Regression guard against the global block-type census recorded in
+        marker_empirical_findings_paper1.md (1085 total descendant blocks).
+        """
+        block = MarkerBlock.model_validate(real_marker_json)
+        assert len(block.iter_descendants()) == 1085
+
+    def test_table_group_pattern_caption_then_table(self, real_marker_json):
+        """
+        Confirmed pattern: every TableGroup's children are exactly
+        [Caption, Table], in that order (see findings doc section 3.1).
+        """
+        block = MarkerBlock.model_validate(real_marker_json)
+        table_groups = [
+            d for d in block.iter_descendants() if d.block_type == "TableGroup"
+        ]
+        assert len(table_groups) == 3
+        for tg in table_groups:
+            assert len(tg.children) == 2
+            assert tg.children[0].block_type == "Caption"
+            assert tg.children[1].block_type == "Table"
+
+    def test_figure_group_pattern_figure_then_caption(self, real_marker_json):
+        """
+        Confirmed pattern: FigureGroup's children are exactly
+        [Figure, Caption] (note: reversed order vs. TableGroup).
+        """
+        block = MarkerBlock.model_validate(real_marker_json)
+        figure_groups = [
+            d for d in block.iter_descendants() if d.block_type == "FigureGroup"
+        ]
+        assert len(figure_groups) == 1
+        fg = figure_groups[0]
+        assert len(fg.children) == 2
+        assert fg.children[0].block_type == "Figure"
+        assert fg.children[1].block_type == "Caption"
+
+    def test_page_7_has_bare_tables_and_flat_footnotes(self, real_marker_json):
+        """
+        Confirmed pattern: page 7 has two Table blocks NOT wrapped in a
+        TableGroup, and three Footnote blocks as flat page-level siblings
+        rather than nested under their related table (findings doc 3.1/3.2).
+        """
+        block = MarkerBlock.model_validate(real_marker_json)
+        doc = MarkerDocument(root=block)
+        page7 = doc.pages[7]
+        child_types = [c.block_type for c in page7.children]
+        assert child_types.count("Table") == 2
+        assert child_types.count("Footnote") == 3
+        assert "TableGroup" not in child_types
+
+    def test_table_block_has_both_html_and_table_cell_children(self, real_marker_json):
+        """
+        Confirmed pattern: a Table block carries a fully-formed
+        HTML structure AND a flat list of TableCell children with their
+        own geometry (findings doc 3.3) -- both representations coexist.
+        """
+        block = MarkerBlock.model_validate(real_marker_json)
+        tables = [d for d in block.iter_descendants() if d.block_type == "Table"]
+        assert len(tables) == 7
+        for table in tables:
+            assert "<table>" in table.html
+            assert table.children is not None
+            assert all(c.block_type == "TableCell" for c in table.children)
+
+    def test_picture_blocks_carry_image_payloads(self, real_marker_json):
+        block = MarkerBlock.model_validate(real_marker_json)
+        pictures = [d for d in block.iter_descendants() if d.block_type == "Picture"]
+        assert len(pictures) == 2
+        for pic in pictures:
+            assert pic.images
+            assert len(next(iter(pic.images.values()))) > 100  # real base64 payload
+
+
+# ---------------------------------------------------------------------------
+# 3. Round-trip serialization losslessness
+# ---------------------------------------------------------------------------
+
+
+class TestRoundTripSerialization:
+    def test_synthetic_block_round_trip_via_dict(self):
+        leaf = _minimal_leaf_block()
+        container = _minimal_container_block(
+            "/page/0/TableGroup/0", "TableGroup", [leaf]
+        )
+        block = MarkerBlock.model_validate(container)
+        dumped = block.model_dump(mode="json")
+        reparsed = MarkerBlock.model_validate(dumped)
+        assert reparsed.model_dump(mode="json") == dumped
+
+    def test_synthetic_block_round_trip_via_json_string(self):
+        block = MarkerBlock.model_validate(_minimal_leaf_block())
+        json_str = block.model_dump_json()
+        reparsed = MarkerBlock.model_validate_json(json_str)
+        assert reparsed.model_dump(mode="json") == block.model_dump(mode="json")
+
+    def test_real_document_round_trip_via_dict_is_lossless(self, real_marker_json):
+        block = MarkerBlock.model_validate(real_marker_json)
+        dumped = block.model_dump(mode="json")
+        reparsed = MarkerBlock.model_validate(dumped)
+        assert reparsed.model_dump(mode="json") == dumped
+
+    def test_real_document_round_trip_preserves_descendant_ids_in_order(
+        self, real_marker_json
+    ):
+        block = MarkerBlock.model_validate(real_marker_json)
+        dumped = block.model_dump(mode="json")
+        reparsed = MarkerBlock.model_validate(dumped)
+        orig_ids = [d.id for d in block.iter_descendants()]
+        reparsed_ids = [d.id for d in reparsed.iter_descendants()]
+        assert orig_ids == reparsed_ids
+
+    def test_real_document_round_trip_via_json_string_is_lossless(
+        self, real_marker_json
+    ):
+        block = MarkerBlock.model_validate(real_marker_json)
+        json_str = block.model_dump_json()
+        reparsed = MarkerBlock.model_validate_json(json_str)
+        assert reparsed.model_dump(mode="json") == block.model_dump(mode="json")
+
+    def test_serialization_is_deterministic_across_repeated_dumps(
+        self, real_marker_json
+    ):
+        """
+        The same in-memory object must always produce byte-identical JSON
+        across repeated calls -- a precondition for "same document always
+        produces same structural representation" further downstream.
+        """
+        block = MarkerBlock.model_validate(real_marker_json)
+        dump1 = block.model_dump_json()
+        dump2 = block.model_dump_json()
+        assert dump1 == dump2
+
+    def test_unknown_extra_field_survives_full_round_trip(self):
+        data = _minimal_leaf_block()
+        data["some_future_field"] = {"nested": [1, 2, 3]}
+        block = MarkerBlock.model_validate(data)
+        json_str = block.model_dump_json()
+        reparsed = MarkerBlock.model_validate_json(json_str)
+        assert reparsed.model_dump(mode="json")["some_future_field"] == {
+            "nested": [1, 2, 3]
+        }