eLifePathways · de-code · Jun 3, 2026 · May 27, 2026 · May 27, 2026 · May 27, 2026
diff --git a/sciencebeam_parser/document/layout_noise_filter.py b/sciencebeam_parser/document/layout_noise_filter.py
@@ -0,0 +1,241 @@
+import logging
+import math
+from collections import defaultdict
+from dataclasses import dataclass
+from typing import Dict, List, NamedTuple, Optional, Sequence, Set, Tuple
+
+from sciencebeam_parser.document.layout_document import (
+    LayoutBlock,
+    LayoutDocument,
+    LayoutPage,
+)
+
+LOGGER = logging.getLogger(__name__)
+
+
+class _BlockOccurrence(NamedTuple):  # one text block occurrence built by _collect_blocks
+    page_index: int  # position of the page within layout_document.pages
+    y_relative: Optional[float]  # smallest token y / page height; None if unknown
+    height_relative: Optional[float]  # token y extent / page height; None if unknown
+    block: LayoutBlock
+
+
+@dataclass
+class LayoutNoiseFilterConfig:
+    enabled: bool = False  # when false, get_noise_blocks returns no blocks
+    repetition_fraction: float = 0.5  # repetition threshold is max(2, this * page count)
+    # Fraction of classifiable occurrences that must fall in the same head or foot zone
+    position_consistency_fraction: float = 0.8
+    # Max standard deviation of y_relative across the in-zone occurrences
+    max_position_stddev: float = 0.05
+    # Occurrences whose height exceeds this multiple of the group median are not filtered
+    # (catches e.g. a large title on page 1 that also repeats as a small footer)
+    max_height_ratio: float = 2.0
+    # If true, never filter running-head blocks on page 1 (the paper title lives there)
+    preserve_first_page_head: bool = True
+    # If true, never filter running-foot blocks on page 1 (off: footers are not special there)
+    preserve_first_page_foot: bool = False
+
+
+@dataclass
+class TaggedNoiseBlock:  # a layout block paired with the noise category it was tagged with
+    block: LayoutBlock
+    note_type: str  # "running-head" | "running-foot"
+
+
+def _get_block_y_relative(block: LayoutBlock, page: LayoutPage) -> Optional[float]:
+    """Return the smallest token y of the block divided by the page height, or None."""
+    page_coordinates = page.meta.coordinates if page.meta else None
+    page_height = page_coordinates.height if page_coordinates else None
+    if not page_height:
+        # Covers both a missing and a zero page height
+        return None
+    top = min(
+        (
+            token.coordinates.y
+            for token in block.iter_all_tokens()
+            if token.coordinates
+        ),
+        default=None,
+    )
+    return None if top is None else top / page_height
+
+
+def _get_block_height_relative(block: LayoutBlock, page: LayoutPage) -> Optional[float]:
+    """Return the y extent covered by the block's tokens over the page height, or None."""
+    page_coordinates = page.meta.coordinates if page.meta else None
+    page_height = page_coordinates.height if page_coordinates else None
+    if not page_height:
+        return None
+    token_boxes = [
+        token.coordinates
+        for token in block.iter_all_tokens()
+        if token.coordinates
+    ]
+    if not token_boxes:
+        return None
+    top = min(box.y for box in token_boxes)
+    bottom = max(box.y + box.height for box in token_boxes)
+    return (bottom - top) / page_height
+
+
+def _stddev(values: List[float]) -> float:  # population std dev; values must be non-empty
+    mean = sum(values) / len(values)
+    return math.sqrt(sum((v - mean) ** 2 for v in values) / len(values))
+
+
+def _compute_page_quartiles(
+    page_block_y_rels: Dict[int, List[float]]
+) -> Tuple[Dict[int, float], Dict[int, float]]:
+    """Return (q1_per_page, q3_per_page): y_relative bounds of the top / bottom block quarter."""
+    q1_map: Dict[int, float] = {}
+    q3_map: Dict[int, float] = {}
+    for page_index, ys in page_block_y_rels.items():
+        if not ys:
+            continue
+        sorted_ys = sorted(ys)
+        # Mirror q1's index from the end: 3 * n // 4 made the foot zone empty for n == 4
+        q1_map[page_index] = sorted_ys[len(sorted_ys) // 4]
+        q3_map[page_index] = sorted_ys[-1 - len(sorted_ys) // 4]
+    return q1_map, q3_map
+
+
+def _in_zone(
+    page_idx: int, y_rel: float, note_type: str,
+    page_q1_y: Dict[int, float], page_q3_y: Dict[int, float],
+) -> bool:
+    """Check y_rel < page q1 for 'running-head', else y_rel > page q3; False if page unknown."""
+    is_head = note_type == 'running-head'
+    limits = page_q1_y if is_head else page_q3_y
+    if page_idx not in limits:
+        return False
+    return y_rel < limits[page_idx] if is_head else y_rel > limits[page_idx]
+
+
+def _classify_repetition_group(
+    occurrences: List[_BlockOccurrence],
+    page_q1_y: Dict[int, float],
+    page_q3_y: Dict[int, float],
+    consistency_fraction: float,
+    max_position_stddev: float,
+) -> Optional[str]:
+    """Return the zone type ('running-head' / 'running-foot') the group fits, or None."""
+    # Only occurrences with a known position on a page that has quartiles can be judged
+    classifiable = sum(
+        occ.y_relative is not None and occ.page_index in page_q1_y for occ in occurrences
+    )
+    if classifiable == 0:
+        return None
+    for note_type in ('running-head', 'running-foot'):
+        zone_y_rels = [
+            occ.y_relative for occ in occurrences
+            if occ.y_relative is not None
+            and _in_zone(occ.page_index, occ.y_relative, note_type, page_q1_y, page_q3_y)
+        ]
+        if len(zone_y_rels) / classifiable < consistency_fraction:
+            continue
+        too_spread = len(zone_y_rels) > 1 and _stddev(zone_y_rels) > max_position_stddev
+        if not too_spread:
+            return note_type
+    return None
+
+
+def _tag_noise_occurrences(
+    occurrences: List[_BlockOccurrence],
+    note_type: str,
+    page_q1_y: Dict[int, float],
+    page_q3_y: Dict[int, float],
+    max_height_ratio: float,
+    preserve_first_page: bool,
+) -> List[TaggedNoiseBlock]:
+    """Tag in-zone occurrences of a classified group, minus preserved page-1 and oversized ones."""
+    in_zone = [
+        occ for occ in occurrences
+        if occ.y_relative is not None
+        and _in_zone(occ.page_index, occ.y_relative, note_type, page_q1_y, page_q3_y)
+    ]
+    heights = sorted(occ.height_relative for occ in in_zone if occ.height_relative is not None)
+    # Upper median of the in-zone heights; a missing or zero median disables the height check
+    median_height = heights[len(heights) // 2] if heights else None
+    height_limit = max_height_ratio * median_height if median_height else None
+    return [
+        TaggedNoiseBlock(block=occ.block, note_type=note_type)
+        for occ in in_zone
+        if not (preserve_first_page and occ.page_index == 0)
+        and not (
+            height_limit is not None and occ.height_relative is not None
+            and occ.height_relative > height_limit
+        )
+    ]
+
+
+def _collect_blocks(
+    layout_document: LayoutDocument,
+) -> Tuple[Dict[str, List[_BlockOccurrence]], Dict[int, List[float]]]:
+    """Group non-empty blocks by case-folded text; also list known y_relative per page."""
+    by_text: Dict[str, List[_BlockOccurrence]] = {}
+    y_rels_by_page: Dict[int, List[float]] = {}
+    for page_index, page in enumerate(layout_document.pages):
+        for block in page.blocks:
+            key = block.text.strip().casefold()
+            # Blocks whose stripped text is empty are not recorded at all
+            if not key:
+                continue
+            top = _get_block_y_relative(block, page)
+            height = _get_block_height_relative(block, page)
+            by_text.setdefault(key, []).append(_BlockOccurrence(page_index, top, height, block))
+            if top is not None:
+                y_rels_by_page.setdefault(page_index, []).append(top)
+    return by_text, y_rels_by_page
+
+
+def get_noise_blocks(
+    layout_document: LayoutDocument,
+    config: LayoutNoiseFilterConfig,
+) -> Sequence[TaggedNoiseBlock]:
+    """Return blocks whose text repeats across pages in a consistent head or foot position."""
+    if not config.enabled or len(layout_document.pages) < 2:
+        return []
+    total_pages = len(layout_document.pages)
+    text_to_occurrences, page_block_y_rels = _collect_blocks(layout_document)
+    page_q1_y, page_q3_y = _compute_page_quartiles(page_block_y_rels)
+    threshold = max(2.0, config.repetition_fraction * total_pages)
+    noise_blocks: List[TaggedNoiseBlock] = []
+    for occurrences in text_to_occurrences.values():
+        # The threshold is a share of the pages, so repeats within one page count once
+        if len({occ.page_index for occ in occurrences}) < threshold:
+            continue
+        note_type = _classify_repetition_group(
+            occurrences, page_q1_y, page_q3_y,
+            config.position_consistency_fraction,
+            config.max_position_stddev,
+        )
+        if not note_type:
+            continue
+        preserve = (
+            config.preserve_first_page_head
+            if note_type == 'running-head'
+            else config.preserve_first_page_foot
+        )
+        noise_blocks.extend(_tag_noise_occurrences(
+            occurrences, note_type, page_q1_y, page_q3_y,
+            config.max_height_ratio, preserve
+        ))
+    LOGGER.debug('found %d layout noise blocks', len(noise_blocks))
+    return noise_blocks
+
+
+def remove_noise_blocks(
+    layout_document: LayoutDocument,
+    noise_blocks: Sequence[TaggedNoiseBlock],
+) -> LayoutDocument:
+    """Return a document without the given noise blocks (matched by object identity)."""
+    if not noise_blocks:
+        # Nothing to remove: hand back the same document object
+        return layout_document
+    noise_ids: Set[int] = {id(tagged.block) for tagged in noise_blocks}
+    kept_pages = []
+    for page in layout_document.pages:
+        kept_blocks = [block for block in page.blocks if id(block) not in noise_ids]
+        kept_pages.append(page.replace(blocks=kept_blocks))
+    return LayoutDocument(pages=kept_pages)
diff --git a/sciencebeam_parser/processors/fulltext/config.py b/sciencebeam_parser/processors/fulltext/config.py
@@ -48,6 +48,10 @@ class FullTextProcessorConfig(NamedTuple):
     use_ocr_model: bool = False
     replace_text_by_cv_graphic: bool = False
     max_graphic_distance: float = DEFAULT_MAX_GRAPHIC_DISTANCE
+    noise_filter_enabled: bool = False
+    noise_filter_repetition_fraction: float = 0.5
+    noise_filter_preserve_first_page_head: bool = True
+    noise_filter_preserve_first_page_foot: bool = False
 
     @staticmethod
     def from_app_config(app_config: AppConfig) -> 'FullTextProcessorConfig':

diff --git a/sciencebeam_parser/processors/fulltext/processor.py b/sciencebeam_parser/processors/fulltext/processor.py
@@ -33,6 +33,7 @@
     SemanticLabel,
     SemanticMixedContentWrapper,
     SemanticMixedNote,
+    SemanticNote,
     SemanticRawAffiliationAddress,
     SemanticRawAuthors,
     SemanticRawEditors,
@@ -53,6 +54,11 @@
 )
 from sciencebeam_parser.document.tei_document import TeiDocument, get_tei_for_semantic_document
 from sciencebeam_parser.document.layout_document import LayoutDocument
+from sciencebeam_parser.document.layout_noise_filter import (
+    LayoutNoiseFilterConfig,
+    get_noise_blocks,
+    remove_noise_blocks,
+)
 from sciencebeam_parser.models.segmentation.model import SegmentationModel
 from sciencebeam_parser.models.header.model import HeaderModel
 from sciencebeam_parser.models.name.model import NameModel
@@ -188,8 +194,18 @@ def get_semantic_document_for_layout_document(
             layout_document,
             context=context
         )
-        segmentation_label_result = self.segmentation_model.get_label_layout_document_result(
+        noise_blocks = get_noise_blocks(
             layout_document,
+            LayoutNoiseFilterConfig(
+                enabled=self.config.noise_filter_enabled,
+                repetition_fraction=self.config.noise_filter_repetition_fraction,
+                preserve_first_page_head=self.config.noise_filter_preserve_first_page_head,
+                preserve_first_page_foot=self.config.noise_filter_preserve_first_page_foot,
+            )
+        )
+        segmentation_input = remove_noise_blocks(layout_document, noise_blocks)
+        segmentation_label_result = self.segmentation_model.get_label_layout_document_result(
+            segmentation_input,
             app_features_context=self.app_features_context
         )
         header_layout_document = segmentation_label_result.get_filtered_document_by_label(
@@ -265,6 +281,11 @@ def get_semantic_document_for_layout_document(
             self._assign_target_content_ids(table_citations, SimpleContentIdMatcher(
                 self._get_semantic_content_text_by_content_id(tables, SemanticLabel)
             ))
+        for nb in noise_blocks:
+            document.body_section.add_content(SemanticNote(
+                layout_block=nb.block,
+                note_type=nb.note_type
+            ))
         if self.config.extract_graphic_bounding_boxes:
             self._process_graphics(
                 document=document,

diff --git a/sciencebeam_parser/resources/default_config/config.yml b/sciencebeam_parser/resources/default_config/config.yml
@@ -74,6 +74,10 @@ lookup:
       - https://raw.githubusercontent.com/kermitt2/grobid/0.6.2/grobid-home/lexicon/names/names.family
 processors:
   fulltext:
+    noise_filter_enabled: true
+    noise_filter_repetition_fraction: 0.5
+    noise_filter_preserve_first_page_head: true
+    noise_filter_preserve_first_page_foot: false
     merge_raw_authors: false
     use_cv_model: false
     cv_render_dpi: 100