From 12f888968bbb411421814a333ba957d8c0f964d3 Mon Sep 17 00:00:00 2001 From: Daniel Ecer Date: Wed, 27 May 2026 22:07:05 +0100 Subject: [PATCH 1/5] Filter running headers, footers and page numbers before segmentation related to https://github.com/eLifePathways/ScienceBeam2.0/issues/61 Add a pre-segmentation noise filter that detects layout blocks repeating at the top or bottom of pages across a document (running heads, running feet) using position and cross-page text repetition. Detected blocks are excluded from the segmentation model input and preserved in the output XML as running-head / running-foot note elements for auditability. Enabled by default via noise_filter_enabled in config.yml. --- .../document/layout_noise_filter.py | 109 ++++++++++++ .../processors/fulltext/config.py | 2 + .../processors/fulltext/processor.py | 21 ++- .../resources/default_config/config.yml | 1 + tests/document/layout_noise_filter_test.py | 155 ++++++++++++++++++ 5 files changed, 287 insertions(+), 1 deletion(-) create mode 100644 sciencebeam_parser/document/layout_noise_filter.py create mode 100644 tests/document/layout_noise_filter_test.py diff --git a/sciencebeam_parser/document/layout_noise_filter.py b/sciencebeam_parser/document/layout_noise_filter.py new file mode 100644 index 00000000..d7b1bb97 --- /dev/null +++ b/sciencebeam_parser/document/layout_noise_filter.py @@ -0,0 +1,109 @@ +import logging +from collections import defaultdict +from dataclasses import dataclass +from typing import Dict, List, Optional, Sequence, Set, Tuple + +from sciencebeam_parser.document.layout_document import ( + LayoutBlock, + LayoutDocument, + LayoutPage, +) + +LOGGER = logging.getLogger(__name__) + + +@dataclass +class LayoutNoiseFilterConfig: + enabled: bool = False + repetition_fraction: float = 0.5 + + +@dataclass +class TaggedNoiseBlock: + block: LayoutBlock + note_type: str # "running-head" | "running-foot" + + +def _get_block_y_relative(block: LayoutBlock, page: LayoutPage) -> Optional[float]: + page_height = ( + 
page.meta.coordinates.height + if page.meta and page.meta.coordinates + else None + ) + if not page_height: + return None + y_values = [ + token.coordinates.y + for token in block.iter_all_tokens() + if token.coordinates + ] + if not y_values: + return None + return min(y_values) / page_height + + +def _classify_repetition_group( + occurrences: List[Tuple[int, Optional[float], LayoutBlock]] +) -> Optional[str]: + y_values = [y for _, y, _ in occurrences if y is not None] + if not y_values: + return None + median_y = sorted(y_values)[len(y_values) // 2] + if median_y < 0.2: + return 'running-head' + if median_y > 0.8: + return 'running-foot' + return None + + +def get_noise_blocks( + layout_document: LayoutDocument, + config: LayoutNoiseFilterConfig, +) -> Sequence[TaggedNoiseBlock]: + if not config.enabled: + return [] + total_pages = len(layout_document.pages) + if total_pages < 2: + return [] + + text_to_occurrences: Dict[str, List[Tuple[int, Optional[float], LayoutBlock]]] = ( + defaultdict(list) + ) + for page_index, page in enumerate(layout_document.pages): + for block in page.blocks: + text = block.text.strip().casefold() + if not text: + continue + y_rel = _get_block_y_relative(block, page) + text_to_occurrences[text].append((page_index, y_rel, block)) + + threshold = max(2.0, config.repetition_fraction * total_pages) + noise_blocks: List[TaggedNoiseBlock] = [] + + for _text, occurrences in text_to_occurrences.items(): + if len(occurrences) < threshold: + continue + note_type = _classify_repetition_group(occurrences) + if not note_type: + continue + for _, _, block in occurrences: + noise_blocks.append(TaggedNoiseBlock(block=block, note_type=note_type)) + + LOGGER.debug('found %d layout noise blocks', len(noise_blocks)) + return noise_blocks + + +def remove_noise_blocks( + layout_document: LayoutDocument, + noise_blocks: Sequence[TaggedNoiseBlock], +) -> LayoutDocument: + if not noise_blocks: + return layout_document + excluded_ids: Set[int] = {id(nb.block) 
for nb in noise_blocks} + return LayoutDocument(pages=[ + page.replace(blocks=[ + block for block in page.blocks + if id(block) not in excluded_ids + ]) + for page in layout_document.pages + ]) diff --git a/sciencebeam_parser/processors/fulltext/config.py b/sciencebeam_parser/processors/fulltext/config.py index 27ce06f9..28a556b1 100644 --- a/sciencebeam_parser/processors/fulltext/config.py +++ b/sciencebeam_parser/processors/fulltext/config.py @@ -48,6 +48,8 @@ class FullTextProcessorConfig(NamedTuple): use_ocr_model: bool = False replace_text_by_cv_graphic: bool = False max_graphic_distance: float = DEFAULT_MAX_GRAPHIC_DISTANCE + noise_filter_enabled: bool = False + noise_filter_repetition_fraction: float = 0.5 @staticmethod def from_app_config(app_config: AppConfig) -> 'FullTextProcessorConfig': diff --git a/sciencebeam_parser/processors/fulltext/processor.py b/sciencebeam_parser/processors/fulltext/processor.py index 01a99931..21ef43f6 100644 --- a/sciencebeam_parser/processors/fulltext/processor.py +++ b/sciencebeam_parser/processors/fulltext/processor.py @@ -33,6 +33,7 @@ SemanticLabel, SemanticMixedContentWrapper, SemanticMixedNote, + SemanticNote, SemanticRawAffiliationAddress, SemanticRawAuthors, SemanticRawEditors, @@ -53,6 +54,11 @@ ) from sciencebeam_parser.document.tei_document import TeiDocument, get_tei_for_semantic_document from sciencebeam_parser.document.layout_document import LayoutDocument +from sciencebeam_parser.document.layout_noise_filter import ( + LayoutNoiseFilterConfig, + get_noise_blocks, + remove_noise_blocks, +) from sciencebeam_parser.models.segmentation.model import SegmentationModel from sciencebeam_parser.models.header.model import HeaderModel from sciencebeam_parser.models.name.model import NameModel @@ -188,8 +194,16 @@ def get_semantic_document_for_layout_document( layout_document, context=context ) - segmentation_label_result = self.segmentation_model.get_label_layout_document_result( + noise_blocks = get_noise_blocks( 
layout_document, + LayoutNoiseFilterConfig( + enabled=self.config.noise_filter_enabled, + repetition_fraction=self.config.noise_filter_repetition_fraction, + ) + ) + segmentation_input = remove_noise_blocks(layout_document, noise_blocks) + segmentation_label_result = self.segmentation_model.get_label_layout_document_result( + segmentation_input, app_features_context=self.app_features_context ) header_layout_document = segmentation_label_result.get_filtered_document_by_label( @@ -265,6 +279,11 @@ def get_semantic_document_for_layout_document( self._assign_target_content_ids(table_citations, SimpleContentIdMatcher( self._get_semantic_content_text_by_content_id(tables, SemanticLabel) )) + for nb in noise_blocks: + document.body_section.add_content(SemanticNote( + layout_block=nb.block, + note_type=nb.note_type + )) if self.config.extract_graphic_bounding_boxes: self._process_graphics( document=document, diff --git a/sciencebeam_parser/resources/default_config/config.yml b/sciencebeam_parser/resources/default_config/config.yml index c787cc90..57258db4 100644 --- a/sciencebeam_parser/resources/default_config/config.yml +++ b/sciencebeam_parser/resources/default_config/config.yml @@ -74,6 +74,7 @@ lookup: - https://raw.githubusercontent.com/kermitt2/grobid/0.6.2/grobid-home/lexicon/names/names.family processors: fulltext: + noise_filter_enabled: true merge_raw_authors: false use_cv_model: false cv_render_dpi: 100 diff --git a/tests/document/layout_noise_filter_test.py b/tests/document/layout_noise_filter_test.py new file mode 100644 index 00000000..71ef3093 --- /dev/null +++ b/tests/document/layout_noise_filter_test.py @@ -0,0 +1,155 @@ +from sciencebeam_parser.document.layout_document import ( + LayoutBlock, + LayoutDocument, + LayoutLine, + LayoutPage, + LayoutPageCoordinates, + LayoutPageMeta, + LayoutToken, +) +from sciencebeam_parser.document.layout_noise_filter import ( + LayoutNoiseFilterConfig, + TaggedNoiseBlock, + get_noise_blocks, + remove_noise_blocks, +) + + 
+PAGE_HEIGHT = 1000 +PAGE_WIDTH = 600 + +ENABLED_CONFIG = LayoutNoiseFilterConfig(enabled=True, repetition_fraction=0.5) + + +def _page_meta(page_number: int = 1) -> LayoutPageMeta: + return LayoutPageMeta( + page_number=page_number, + coordinates=LayoutPageCoordinates( + x=0, y=0, width=PAGE_WIDTH, height=PAGE_HEIGHT, page_number=page_number + ) + ) + + +def _block_at_y(text: str, y: float, page_number: int = 1) -> LayoutBlock: + token = LayoutToken( + text=text, + coordinates=LayoutPageCoordinates( + x=10, y=y, width=200, height=20, page_number=page_number + ) + ) + return LayoutBlock(lines=[LayoutLine(tokens=[token])]) + + +def _page(blocks: list, page_number: int = 1) -> LayoutPage: + return LayoutPage(blocks=blocks, meta=_page_meta(page_number)) + + +def _doc(*pages: LayoutPage) -> LayoutDocument: + return LayoutDocument(pages=list(pages)) + + +class TestGetNoiseBlocks: + def test_returns_empty_when_disabled(self): + header = _block_at_y('Journal Name', y=10) + doc = _doc( + _page([header], page_number=1), + _page([_block_at_y('Journal Name', y=10)], page_number=2), + ) + result = get_noise_blocks(doc, LayoutNoiseFilterConfig(enabled=False)) + assert not result + + def test_returns_empty_for_single_page(self): + doc = _doc(_page([_block_at_y('Journal Name', y=10)])) + result = get_noise_blocks(doc, ENABLED_CONFIG) + assert not result + + def test_detects_running_head_at_top(self): + # y=10 / PAGE_HEIGHT=1000 → y_relative=0.01 < 0.2 + header_text = 'Journal of Something' + doc = _doc( + _page([_block_at_y(header_text, y=10)], page_number=1), + _page([_block_at_y(header_text, y=10)], page_number=2), + _page([_block_at_y(header_text, y=10)], page_number=3), + ) + result = get_noise_blocks(doc, ENABLED_CONFIG) + note_types = {nb.note_type for nb in result} + assert note_types == {'running-head'} + assert len(result) == 3 + + def test_detects_running_foot_at_bottom(self): + # y=900 / PAGE_HEIGHT=1000 → y_relative=0.9 > 0.8 + footer_text = 'Copyright 2024' + doc = 
_doc( + _page([_block_at_y(footer_text, y=900)], page_number=1), + _page([_block_at_y(footer_text, y=900)], page_number=2), + ) + result = get_noise_blocks(doc, ENABLED_CONFIG) + note_types = {nb.note_type for nb in result} + assert note_types == {'running-foot'} + + def test_does_not_flag_non_repeating_block(self): + doc = _doc( + _page([_block_at_y('Unique text page 1', y=10)], page_number=1), + _page([_block_at_y('Unique text page 2', y=10)], page_number=2), + ) + result = get_noise_blocks(doc, ENABLED_CONFIG) + assert not result + + def test_does_not_flag_repeating_block_in_middle(self): + # y=500 / PAGE_HEIGHT=1000 → y_relative=0.5 — middle of page, not noise + mid_text = 'Section Title' + doc = _doc( + _page([_block_at_y(mid_text, y=500)], page_number=1), + _page([_block_at_y(mid_text, y=500)], page_number=2), + ) + result = get_noise_blocks(doc, ENABLED_CONFIG) + assert not result + + def test_respects_repetition_fraction(self): + # 2 out of 5 pages = 0.4, below default threshold of 0.5 → not flagged + header_text = 'Sparse Header' + doc = _doc( + _page([_block_at_y(header_text, y=10)], page_number=1), + _page([_block_at_y('body text', y=400)], page_number=2), + _page([_block_at_y('body text', y=400)], page_number=3), + _page([_block_at_y('body text', y=400)], page_number=4), + _page([_block_at_y(header_text, y=10)], page_number=5), + ) + result = get_noise_blocks(doc, ENABLED_CONFIG) + assert not any(nb.block.text.strip() == header_text for nb in result) + + def test_case_insensitive_normalisation(self): + doc = _doc( + _page([_block_at_y('JOURNAL NAME', y=10)], page_number=1), + _page([_block_at_y('journal name', y=10)], page_number=2), + ) + result = get_noise_blocks(doc, ENABLED_CONFIG) + assert len(result) == 2 + + +class TestRemoveNoiseBlocks: + def test_returns_same_document_when_no_noise(self): + doc = _doc(_page([_block_at_y('body', y=400)])) + assert remove_noise_blocks(doc, []) is doc + + def test_removes_tagged_blocks(self): + body = 
_block_at_y('body text', y=400) + header = _block_at_y('Journal Name', y=10) + page = _page([body, header]) + doc = _doc(page) + noise = [TaggedNoiseBlock(block=header, note_type='running-head')] + result = remove_noise_blocks(doc, noise) + remaining = list(result.iter_all_blocks()) + assert body in remaining + assert header not in remaining + + def test_preserves_page_structure(self): + b1 = _block_at_y('block 1', y=400, page_number=1) + b2 = _block_at_y('header', y=10, page_number=1) + b3 = _block_at_y('block 3', y=400, page_number=2) + doc = _doc(_page([b1, b2], page_number=1), _page([b3], page_number=2)) + noise = [TaggedNoiseBlock(block=b2, note_type='running-head')] + result = remove_noise_blocks(doc, noise) + assert len(result.pages) == 2 + assert len(result.pages[0].blocks) == 1 + assert len(result.pages[1].blocks) == 1 From a8d52d1591a89e83186e5b1b0c70f0dde991b411 Mon Sep 17 00:00:00 2001 From: Daniel Ecer Date: Mon, 1 Jun 2026 12:17:28 +0100 Subject: [PATCH 2/5] Replace fixed position thresholds with data-driven quartile detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rewrite the noise block classifier to avoid hardcoded 0.2/0.8 page fractions. Instead, the top/bottom quartile of each page's own block y-distribution defines the noise zone, so the threshold adapts to each document's layout. 
Two additional guards prevent false positives: - Stddev check: position must be stable across occurrences - Height check: occurrences whose block height exceeds 2× the group median are not filtered (catches a large title on page 1 that also repeats as a small footer on pages 2+, as seen in scielo_br) --- .../document/layout_noise_filter.py | 161 +++++++++++++++--- tests/document/layout_noise_filter_test.py | 109 ++++++++---- 2 files changed, 214 insertions(+), 56 deletions(-) diff --git a/sciencebeam_parser/document/layout_noise_filter.py b/sciencebeam_parser/document/layout_noise_filter.py index d7b1bb97..89dd87dc 100644 --- a/sciencebeam_parser/document/layout_noise_filter.py +++ b/sciencebeam_parser/document/layout_noise_filter.py @@ -1,4 +1,5 @@ import logging +import math from collections import defaultdict from dataclasses import dataclass from typing import Dict, List, Optional, Sequence, Set, Tuple @@ -11,11 +12,21 @@ LOGGER = logging.getLogger(__name__) +# Occurrence: (page_index, y_relative, height_relative, block) +_Occurrence = Tuple[int, Optional[float], Optional[float], LayoutBlock] + @dataclass class LayoutNoiseFilterConfig: enabled: bool = False repetition_fraction: float = 0.5 + # Fraction of occurrences that must fall in the top/bottom quartile zone + position_consistency_fraction: float = 0.8 + # Max standard deviation of y_relative across qualifying occurrences + max_position_stddev: float = 0.05 + # Occurrences whose height exceeds this multiple of the group median are not filtered + # (catches e.g. 
a large title on page 1 that also repeats as a small footer) + max_height_ratio: float = 2.0 @dataclass @@ -42,20 +53,127 @@ def _get_block_y_relative(block: LayoutBlock, page: LayoutPage) -> Optional[floa return min(y_values) / page_height +def _get_block_height_relative(block: LayoutBlock, page: LayoutPage) -> Optional[float]: + page_height = ( + page.meta.coordinates.height + if page.meta and page.meta.coordinates + else None + ) + if not page_height: + return None + extents = [ + (token.coordinates.y, token.coordinates.y + token.coordinates.height) + for token in block.iter_all_tokens() + if token.coordinates + ] + if not extents: + return None + return (max(y1 for _, y1 in extents) - min(y0 for y0, _ in extents)) / page_height + + +def _stddev(values: List[float]) -> float: + mean = sum(values) / len(values) + return math.sqrt(sum((v - mean) ** 2 for v in values) / len(values)) + + +def _compute_page_quartiles( + page_block_y_rels: Dict[int, List[float]] +) -> Tuple[Dict[int, float], Dict[int, float]]: + """Return (q1_per_page, q3_per_page) — 25th and 75th percentile of y_relative.""" + q1_map: Dict[int, float] = {} + q3_map: Dict[int, float] = {} + for page_index, ys in page_block_y_rels.items(): + if not ys: + continue + sorted_ys = sorted(ys) + n = len(sorted_ys) + q1_map[page_index] = sorted_ys[n // 4] + q3_map[page_index] = sorted_ys[min(3 * n // 4, n - 1)] + return q1_map, q3_map + + +def _in_zone( + page_idx: int, + y_rel: float, + note_type: str, + page_q1_y: Dict[int, float], + page_q3_y: Dict[int, float], +) -> bool: + if note_type == 'running-head': + return page_idx in page_q1_y and y_rel < page_q1_y[page_idx] + return page_idx in page_q3_y and y_rel > page_q3_y[page_idx] + + def _classify_repetition_group( - occurrences: List[Tuple[int, Optional[float], LayoutBlock]] + occurrences: List[_Occurrence], + page_q1_y: Dict[int, float], + page_q3_y: Dict[int, float], + consistency_fraction: float, + max_position_stddev: float, ) -> Optional[str]: - 
y_values = [y for _, y, _ in occurrences if y is not None] - if not y_values: + classifiable = sum( + 1 for idx, y_rel, _h, _b in occurrences + if y_rel is not None and idx in page_q1_y + ) + if not classifiable: return None - median_y = sorted(y_values)[len(y_values) // 2] - if median_y < 0.2: - return 'running-head' - if median_y > 0.8: - return 'running-foot' + for note_type in ('running-head', 'running-foot'): + zone = [ + (idx, y_rel) for idx, y_rel, _h, _b in occurrences + if y_rel is not None + and _in_zone(idx, y_rel, note_type, page_q1_y, page_q3_y) + ] + if len(zone) / classifiable < consistency_fraction: + continue + y_rels = [y for _, y in zone] + if len(y_rels) > 1 and _stddev(y_rels) > max_position_stddev: + continue + return note_type return None +def _tag_noise_occurrences( + occurrences: List[_Occurrence], + note_type: str, + page_q1_y: Dict[int, float], + page_q3_y: Dict[int, float], + max_height_ratio: float, +) -> List[TaggedNoiseBlock]: + zone_heights = [ + h for idx, y_rel, h, _ in occurrences + if h is not None and y_rel is not None + and _in_zone(idx, y_rel, note_type, page_q1_y, page_q3_y) + ] + median_height = sorted(zone_heights)[len(zone_heights) // 2] if zone_heights else None + result = [] + for page_idx, y_rel, height_rel, block in occurrences: + if y_rel is None or not _in_zone(page_idx, y_rel, note_type, page_q1_y, page_q3_y): + continue + if (median_height and height_rel is not None + and height_rel > max_height_ratio * median_height): + continue + result.append(TaggedNoiseBlock(block=block, note_type=note_type)) + return result + + +def _collect_blocks( + layout_document: LayoutDocument, +) -> Tuple[Dict[str, List[_Occurrence]], Dict[int, List[float]]]: + text_to_occurrences: Dict[str, List[_Occurrence]] = defaultdict(list) + page_block_y_rels: Dict[int, List[float]] = defaultdict(list) + for page_index, page in enumerate(layout_document.pages): + for block in page.blocks: + text = block.text.strip().casefold() + if not text: + 
continue + y_rel = _get_block_y_relative(block, page) + h_rel = _get_block_height_relative(block, page) + text_to_occurrences[text].append((page_index, y_rel, h_rel, block)) + if y_rel is not None: + page_block_y_rels[page_index].append(y_rel) + return dict(text_to_occurrences), dict(page_block_y_rels) + + def get_noise_blocks( layout_document: LayoutDocument, config: LayoutNoiseFilterConfig, @@ -65,30 +183,23 @@ def get_noise_blocks( total_pages = len(layout_document.pages) if total_pages < 2: return [] - - text_to_occurrences: Dict[str, List[Tuple[int, Optional[float], LayoutBlock]]] = ( - defaultdict(list) - ) - for page_index, page in enumerate(layout_document.pages): - for block in page.blocks: - text = block.text.strip().casefold() - if not text: - continue - y_rel = _get_block_y_relative(block, page) - text_to_occurrences[text].append((page_index, y_rel, block)) - + text_to_occurrences, page_block_y_rels = _collect_blocks(layout_document) + page_q1_y, page_q3_y = _compute_page_quartiles(page_block_y_rels) threshold = max(2.0, config.repetition_fraction * total_pages) noise_blocks: List[TaggedNoiseBlock] = [] - for _text, occurrences in text_to_occurrences.items(): if len(occurrences) < threshold: continue - note_type = _classify_repetition_group(occurrences) + note_type = _classify_repetition_group( + occurrences, page_q1_y, page_q3_y, + config.position_consistency_fraction, + config.max_position_stddev, + ) if not note_type: continue - for _, _, block in occurrences: - noise_blocks.append(TaggedNoiseBlock(block=block, note_type=note_type)) - + noise_blocks.extend(_tag_noise_occurrences( + occurrences, note_type, page_q1_y, page_q3_y, config.max_height_ratio + )) LOGGER.debug('found %d layout noise blocks', len(noise_blocks)) return noise_blocks diff --git a/tests/document/layout_noise_filter_test.py b/tests/document/layout_noise_filter_test.py index 71ef3093..b8901972 100644 --- a/tests/document/layout_noise_filter_test.py +++ 
b/tests/document/layout_noise_filter_test.py @@ -17,9 +17,13 @@ PAGE_HEIGHT = 1000 PAGE_WIDTH = 600 - ENABLED_CONFIG = LayoutNoiseFilterConfig(enabled=True, repetition_fraction=0.5) +# Body blocks at these y positions give a spread across the middle of the page. +# y_rels: 0.2, 0.3, 0.5, 0.6, 0.7 → q1≈0.3, q3≈0.6 when combined with a +# header at y_rel≈0.01 or footer at y_rel≈0.95. +_BODY_Y_POSITIONS = [200, 300, 500, 600, 700] + def _page_meta(page_number: int = 1) -> LayoutPageMeta: return LayoutPageMeta( @@ -30,11 +34,13 @@ def _page_meta(page_number: int = 1) -> LayoutPageMeta: ) -def _block_at_y(text: str, y: float, page_number: int = 1) -> LayoutBlock: +def _block_at_y( + text: str, y: float, page_number: int = 1, height: float = 20 +) -> LayoutBlock: token = LayoutToken( text=text, coordinates=LayoutPageCoordinates( - x=10, y=y, width=200, height=20, page_number=page_number + x=10, y=y, width=200, height=height, page_number=page_number ) ) return LayoutBlock(lines=[LayoutLine(tokens=[token])]) @@ -44,88 +50,129 @@ def _page(blocks: list, page_number: int = 1) -> LayoutPage: return LayoutPage(blocks=blocks, meta=_page_meta(page_number)) +def _page_with_body(extra_blocks: list, page_number: int) -> LayoutPage: + """Build a realistic page with unique body content plus any extra blocks.""" + body_blocks = [ + _block_at_y(f'body p{page_number} i{i}', y=y, page_number=page_number) + for i, y in enumerate(_BODY_Y_POSITIONS) + ] + return _page(extra_blocks + body_blocks, page_number=page_number) + + def _doc(*pages: LayoutPage) -> LayoutDocument: return LayoutDocument(pages=list(pages)) class TestGetNoiseBlocks: def test_returns_empty_when_disabled(self): - header = _block_at_y('Journal Name', y=10) doc = _doc( - _page([header], page_number=1), - _page([_block_at_y('Journal Name', y=10)], page_number=2), + _page_with_body([_block_at_y('Journal Name', y=10)], page_number=1), + _page_with_body([_block_at_y('Journal Name', y=10)], page_number=2), ) result = 
get_noise_blocks(doc, LayoutNoiseFilterConfig(enabled=False)) assert not result def test_returns_empty_for_single_page(self): - doc = _doc(_page([_block_at_y('Journal Name', y=10)])) + doc = _doc(_page_with_body([_block_at_y('Journal Name', y=10)], page_number=1)) result = get_noise_blocks(doc, ENABLED_CONFIG) assert not result def test_detects_running_head_at_top(self): - # y=10 / PAGE_HEIGHT=1000 → y_relative=0.01 < 0.2 + # y=10 → y_rel=0.01, well below q1 of each page header_text = 'Journal of Something' doc = _doc( - _page([_block_at_y(header_text, y=10)], page_number=1), - _page([_block_at_y(header_text, y=10)], page_number=2), - _page([_block_at_y(header_text, y=10)], page_number=3), + _page_with_body([_block_at_y(header_text, y=10)], page_number=1), + _page_with_body([_block_at_y(header_text, y=10)], page_number=2), + _page_with_body([_block_at_y(header_text, y=10)], page_number=3), ) result = get_noise_blocks(doc, ENABLED_CONFIG) - note_types = {nb.note_type for nb in result} - assert note_types == {'running-head'} + assert {nb.note_type for nb in result} == {'running-head'} assert len(result) == 3 def test_detects_running_foot_at_bottom(self): - # y=900 / PAGE_HEIGHT=1000 → y_relative=0.9 > 0.8 + # y=950 → y_rel=0.95, well above q3 of each page footer_text = 'Copyright 2024' doc = _doc( - _page([_block_at_y(footer_text, y=900)], page_number=1), - _page([_block_at_y(footer_text, y=900)], page_number=2), + _page_with_body([_block_at_y(footer_text, y=950)], page_number=1), + _page_with_body([_block_at_y(footer_text, y=950)], page_number=2), ) result = get_noise_blocks(doc, ENABLED_CONFIG) - note_types = {nb.note_type for nb in result} - assert note_types == {'running-foot'} + assert {nb.note_type for nb in result} == {'running-foot'} def test_does_not_flag_non_repeating_block(self): doc = _doc( - _page([_block_at_y('Unique text page 1', y=10)], page_number=1), - _page([_block_at_y('Unique text page 2', y=10)], page_number=2), + 
_page_with_body([_block_at_y('Unique text page 1', y=10)], page_number=1), + _page_with_body([_block_at_y('Unique text page 2', y=10)], page_number=2), ) result = get_noise_blocks(doc, ENABLED_CONFIG) assert not result def test_does_not_flag_repeating_block_in_middle(self): - # y=500 / PAGE_HEIGHT=1000 → y_relative=0.5 — middle of page, not noise + # y=400 → y_rel=0.4, between q1 and q3 → neither zone mid_text = 'Section Title' doc = _doc( - _page([_block_at_y(mid_text, y=500)], page_number=1), - _page([_block_at_y(mid_text, y=500)], page_number=2), + _page_with_body([_block_at_y(mid_text, y=400)], page_number=1), + _page_with_body([_block_at_y(mid_text, y=400)], page_number=2), ) result = get_noise_blocks(doc, ENABLED_CONFIG) - assert not result + assert not any(nb.block.text.strip() == mid_text for nb in result) + + def test_does_not_flag_block_with_inconsistent_position(self): + # Same text but moves between top and bottom across pages → high stddev → not noise + wandering_text = 'Wandering Block' + doc = _doc( + _page_with_body([_block_at_y(wandering_text, y=10)], page_number=1), + _page_with_body([_block_at_y(wandering_text, y=950)], page_number=2), + _page_with_body([_block_at_y(wandering_text, y=10)], page_number=3), + _page_with_body([_block_at_y(wandering_text, y=950)], page_number=4), + ) + result = get_noise_blocks(doc, ENABLED_CONFIG) + assert not any(nb.block.text.strip() == wandering_text for nb in result) def test_respects_repetition_fraction(self): # 2 out of 5 pages = 0.4, below default threshold of 0.5 → not flagged header_text = 'Sparse Header' doc = _doc( - _page([_block_at_y(header_text, y=10)], page_number=1), - _page([_block_at_y('body text', y=400)], page_number=2), - _page([_block_at_y('body text', y=400)], page_number=3), - _page([_block_at_y('body text', y=400)], page_number=4), - _page([_block_at_y(header_text, y=10)], page_number=5), + _page_with_body([_block_at_y(header_text, y=10)], page_number=1), + _page_with_body([], 
page_number=2), + _page_with_body([], page_number=3), + _page_with_body([], page_number=4), + _page_with_body([_block_at_y(header_text, y=10)], page_number=5), ) result = get_noise_blocks(doc, ENABLED_CONFIG) assert not any(nb.block.text.strip() == header_text for nb in result) def test_case_insensitive_normalisation(self): doc = _doc( - _page([_block_at_y('JOURNAL NAME', y=10)], page_number=1), - _page([_block_at_y('journal name', y=10)], page_number=2), + _page_with_body([_block_at_y('JOURNAL NAME', y=10)], page_number=1), + _page_with_body([_block_at_y('journal name', y=10)], page_number=2), ) result = get_noise_blocks(doc, ENABLED_CONFIG) assert len(result) == 2 + def test_large_title_on_first_page_not_filtered_when_repeated_as_small_footer(self): + # Regression (scielo_br): a paper title appears large (height=60) near the + # top of page 1, and the same text repeats as a small footer (height=8) at + # the bottom of pages 2+. The height check must preserve the page-1 title. + title_text = 'A radicalização do debate sobre inclusão escolar no Brasil' + doc = _doc( + # page 1: large title near the top + _page_with_body([_block_at_y(title_text, y=50, height=60)], page_number=1), + # pages 2-5: same text as small footer at the bottom + _page_with_body([_block_at_y(title_text, y=950, height=8)], page_number=2), + _page_with_body([_block_at_y(title_text, y=950, height=8)], page_number=3), + _page_with_body([_block_at_y(title_text, y=950, height=8)], page_number=4), + _page_with_body([_block_at_y(title_text, y=950, height=8)], page_number=5), + ) + result = get_noise_blocks(doc, ENABLED_CONFIG) + # Footer occurrences on pages 2-5 are filtered + assert all(nb.note_type == 'running-foot' for nb in result) + assert len(result) == 4 + # The large page-1 title must not be among the filtered blocks + filtered_ids = {id(nb.block) for nb in result} + page1_title_block = doc.pages[0].blocks[0] + assert id(page1_title_block) not in filtered_ids + class TestRemoveNoiseBlocks: def 
test_returns_same_document_when_no_noise(self): From 6d4e8c1f1d1a72deef730254e1172f45ea6bf441 Mon Sep 17 00:00:00 2001 From: Daniel Ecer Date: Mon, 1 Jun 2026 12:37:09 +0100 Subject: [PATCH 3/5] improve readability using _BlockOccurrence --- .../document/layout_noise_filter.py | 52 +++++++++++-------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/sciencebeam_parser/document/layout_noise_filter.py b/sciencebeam_parser/document/layout_noise_filter.py index 89dd87dc..e261866d 100644 --- a/sciencebeam_parser/document/layout_noise_filter.py +++ b/sciencebeam_parser/document/layout_noise_filter.py @@ -2,7 +2,7 @@ import math from collections import defaultdict from dataclasses import dataclass -from typing import Dict, List, Optional, Sequence, Set, Tuple +from typing import Dict, List, NamedTuple, Optional, Sequence, Set, Tuple from sciencebeam_parser.document.layout_document import ( LayoutBlock, @@ -12,8 +12,12 @@ LOGGER = logging.getLogger(__name__) -# Occurrence: (page_index, y_relative, height_relative, block) -_Occurrence = Tuple[int, Optional[float], Optional[float], LayoutBlock] + +class _BlockOccurrence(NamedTuple): + page_index: int + y_relative: Optional[float] + height_relative: Optional[float] + block: LayoutBlock @dataclass @@ -105,27 +109,27 @@ def _in_zone( def _classify_repetition_group( - occurrences: List[_Occurrence], + occurrences: List[_BlockOccurrence], page_q1_y: Dict[int, float], page_q3_y: Dict[int, float], consistency_fraction: float, max_position_stddev: float, ) -> Optional[str]: classifiable = sum( - 1 for idx, y_rel, _h, _b in occurrences - if y_rel is not None and idx in page_q1_y + 1 for occ in occurrences + if occ.y_relative is not None and occ.page_index in page_q1_y ) if not classifiable: return None for note_type in ('running-head', 'running-foot'): zone = [ - (idx, y_rel) for idx, y_rel, _h, _b in occurrences - if y_rel is not None - and _in_zone(idx, y_rel, note_type, page_q1_y, page_q3_y) + occ for occ in 
occurrences + if occ.y_relative is not None + and _in_zone(occ.page_index, occ.y_relative, note_type, page_q1_y, page_q3_y) ] if len(zone) / classifiable < consistency_fraction: continue - y_rels = [y for _, y in zone] + y_rels = [occ.y_relative for occ in zone if occ.y_relative is not None] if len(y_rels) > 1 and _stddev(y_rels) > max_position_stddev: continue return note_type @@ -133,33 +137,35 @@ def _classify_repetition_group( def _tag_noise_occurrences( - occurrences: List[_Occurrence], + occurrences: List[_BlockOccurrence], note_type: str, page_q1_y: Dict[int, float], page_q3_y: Dict[int, float], max_height_ratio: float, ) -> List[TaggedNoiseBlock]: zone_heights = [ - h for idx, y_rel, h, _ in occurrences - if h is not None and y_rel is not None - and _in_zone(idx, y_rel, note_type, page_q1_y, page_q3_y) + occ.height_relative for occ in occurrences + if occ.height_relative is not None and occ.y_relative is not None + and _in_zone(occ.page_index, occ.y_relative, note_type, page_q1_y, page_q3_y) ] median_height = sorted(zone_heights)[len(zone_heights) // 2] if zone_heights else None result = [] - for page_idx, y_rel, height_rel, block in occurrences: - if y_rel is None or not _in_zone(page_idx, y_rel, note_type, page_q1_y, page_q3_y): + for occ in occurrences: + if occ.y_relative is None: + continue + if not _in_zone(occ.page_index, occ.y_relative, note_type, page_q1_y, page_q3_y): continue - if (median_height and height_rel is not None - and height_rel > max_height_ratio * median_height): + if (median_height and occ.height_relative is not None + and occ.height_relative > max_height_ratio * median_height): continue - result.append(TaggedNoiseBlock(block=block, note_type=note_type)) + result.append(TaggedNoiseBlock(block=occ.block, note_type=note_type)) return result def _collect_blocks( layout_document: LayoutDocument, -) -> Tuple[Dict[str, List[_Occurrence]], Dict[int, List[float]]]: - text_to_occurrences: Dict[str, List[_Occurrence]] = defaultdict(list) +) -> 
Tuple[Dict[str, List[_BlockOccurrence]], Dict[int, List[float]]]: + text_to_occurrences: Dict[str, List[_BlockOccurrence]] = defaultdict(list) page_block_y_rels: Dict[int, List[float]] = defaultdict(list) for page_index, page in enumerate(layout_document.pages): for block in page.blocks: @@ -168,7 +174,9 @@ def _collect_blocks( continue y_rel = _get_block_y_relative(block, page) h_rel = _get_block_height_relative(block, page) - text_to_occurrences[text].append((page_index, y_rel, h_rel, block)) + text_to_occurrences[text].append( + _BlockOccurrence(page_index, y_rel, h_rel, block) + ) if y_rel is not None: page_block_y_rels[page_index].append(y_rel) return dict(text_to_occurrences), dict(page_block_y_rels) From 79e2fa6a579a5f450b420f2a1e6a4e48f6d3a772 Mon Sep 17 00:00:00 2001 From: Daniel Ecer Date: Tue, 2 Jun 2026 12:39:19 +0100 Subject: [PATCH 4/5] Also use noise filtering in low level models API endpoints --- .../service/api/routers/models.py | 53 ++++++++++++++----- 1 file changed, 40 insertions(+), 13 deletions(-) diff --git a/sciencebeam_parser/service/api/routers/models.py b/sciencebeam_parser/service/api/routers/models.py index 6faa7806..63ac43c2 100644 --- a/sciencebeam_parser/service/api/routers/models.py +++ b/sciencebeam_parser/service/api/routers/models.py @@ -21,6 +21,11 @@ normalize_layout_document ) from sciencebeam_parser.document.layout_document import LayoutDocument +from sciencebeam_parser.document.layout_noise_filter import ( + LayoutNoiseFilterConfig, + get_noise_blocks, + remove_noise_blocks, +) from sciencebeam_parser.document.semantic_document import ( SemanticMixedContentWrapper, SemanticRawAffiliationAddress, @@ -66,13 +71,15 @@ def __init__( model: Model, pdfalto_wrapper: PdfAltoWrapper, app_features_context: AppFeaturesContext, - model_name: str = 'dummy' + model_name: str = 'dummy', + noise_filter_config: Optional[LayoutNoiseFilterConfig] = None ): self.name = name self.model = model self.pdfalto_wrapper = pdfalto_wrapper 
self.app_features_context = app_features_context self.model_name = model_name + self.noise_filter_config = noise_filter_config def _register_feature_names_route(self, router: APIRouter) -> None: @router.get('/feature-names') @@ -110,6 +117,12 @@ def process_post( ) return router + def _apply_noise_filter(self, layout_document: LayoutDocument) -> LayoutDocument: + if not self.noise_filter_config or not self.noise_filter_config.enabled: + return layout_document + noise_blocks = get_noise_blocks(layout_document, self.noise_filter_config) + return remove_noise_blocks(layout_document, noise_blocks) + def iter_filter_layout_document( self, layout_document: LayoutDocument, @@ -140,8 +153,8 @@ def handle_post( # pylint: disable=too-many-locals xml_content = output_path.read_bytes() root = etree.fromstring(xml_content) layout_document_iterable = self.iter_filter_layout_document( - normalize_layout_document( - parse_alto_root(root) + self._apply_noise_filter( + normalize_layout_document(parse_alto_root(root)) ), filter_params=(filter_params or {}) ) @@ -520,13 +533,18 @@ def create_models_router( fulltext_models = sciencebeam_parser.fulltext_models app_features_context = sciencebeam_parser.app_features_context fulltext_processor_config = sciencebeam_parser.fulltext_processor_config + noise_filter_config = LayoutNoiseFilterConfig( + enabled=fulltext_processor_config.noise_filter_enabled, + repetition_fraction=fulltext_processor_config.noise_filter_repetition_fraction, + ) router.include_router( ModelResponseRouterFactory( 'Segmentation', model=fulltext_models.segmentation_model, pdfalto_wrapper=pdfalto_wrapper, - app_features_context=app_features_context + app_features_context=app_features_context, + noise_filter_config=noise_filter_config ).create_router(), prefix='/models/segmentation' ) @@ -538,7 +556,8 @@ def create_models_router( pdfalto_wrapper=pdfalto_wrapper, app_features_context=app_features_context, segmentation_model=fulltext_models.segmentation_model, - 
segmentation_labels=['<header>']
+ segmentation_labels=['<header>'],
+ noise_filter_config=noise_filter_config ).create_router(), prefix='/models/header' ) @@ -552,7 +571,8 @@ def create_models_router( segmentation_model=fulltext_models.segmentation_model, segmentation_labels=['<header>'],
header_model=fulltext_models.header_model, - merge_raw_authors=fulltext_processor_config.merge_raw_authors + merge_raw_authors=fulltext_processor_config.merge_raw_authors, + noise_filter_config=noise_filter_config ).create_router(), prefix='/models/name-header' ) @@ -565,7 +585,8 @@ def create_models_router( app_features_context=app_features_context, segmentation_model=fulltext_models.segmentation_model, segmentation_labels=['<header>'],
- header_model=fulltext_models.header_model + header_model=fulltext_models.header_model, + noise_filter_config=noise_filter_config ).create_router(), prefix='/models/affiliation-address' ) @@ -579,7 +600,8 @@ def create_models_router( pdfalto_wrapper=pdfalto_wrapper, app_features_context=app_features_context, segmentation_model=fulltext_models.segmentation_model, - segmentation_labels=fulltext_segmentation_labels + segmentation_labels=fulltext_segmentation_labels, + noise_filter_config=noise_filter_config ).create_router(), prefix='/models/fulltext' ) @@ -592,7 +614,8 @@ def create_models_router( app_features_context=app_features_context, segmentation_model=fulltext_models.segmentation_model, segmentation_labels=fulltext_segmentation_labels, - fulltext_model=fulltext_models.fulltext_model + fulltext_model=fulltext_models.fulltext_model, + noise_filter_config=noise_filter_config ).create_router(), prefix='/models/figure' ) @@ -605,7 +628,8 @@ def create_models_router( app_features_context=app_features_context, segmentation_model=fulltext_models.segmentation_model, segmentation_labels=fulltext_segmentation_labels, - fulltext_model=fulltext_models.fulltext_model + fulltext_model=fulltext_models.fulltext_model, + noise_filter_config=noise_filter_config ).create_router(), prefix='/models/table' ) @@ -617,7 +641,8 @@ def create_models_router( pdfalto_wrapper=pdfalto_wrapper, app_features_context=app_features_context, segmentation_model=fulltext_models.segmentation_model, - segmentation_labels=['<references>'] + segmentation_labels=['<references>'], + noise_filter_config=noise_filter_config ).create_router(), prefix='/models/reference-segmenter' ) @@ -630,7 +655,8 @@ def create_models_router( app_features_context=app_features_context, segmentation_model=fulltext_models.segmentation_model, segmentation_labels=['<references>'], - reference_segmenter_model=fulltext_models.reference_segmenter_model + reference_segmenter_model=fulltext_models.reference_segmenter_model, +
noise_filter_config=noise_filter_config ).create_router(), prefix='/models/citation' ) @@ -644,7 +670,8 @@ def create_models_router( segmentation_model=fulltext_models.segmentation_model, segmentation_labels=[''], reference_segmenter_model=fulltext_models.reference_segmenter_model, - citation_model=fulltext_models.citation_model + citation_model=fulltext_models.citation_model, + noise_filter_config=noise_filter_config ).create_router(), prefix='/models/name-citation' ) From f51f26cc16846ab46dd37bc6ae64deb1b880342f Mon Sep 17 00:00:00 2001 From: Daniel Ecer Date: Wed, 3 Jun 2026 10:54:21 +0100 Subject: [PATCH 5/5] Add configurable first-page preservation for head and foot separately Introduce preserve_first_page_head (default true) and preserve_first_page_foot (default false) to LayoutNoiseFilterConfig. Running heads on page 1 are preserved because the paper title lives there and is indistinguishable by position from a running head. Running footers on page 1 have no such special status and are filtered like any other page, since footers can appear between body text when paragraphs cross page boundaries. All four noise filter settings are now surfaced in config.yml. --- .../document/layout_noise_filter.py | 15 ++++++- .../processors/fulltext/config.py | 2 + .../processors/fulltext/processor.py | 2 + .../resources/default_config/config.yml | 3 ++ .../service/api/routers/models.py | 2 + tests/document/layout_noise_filter_test.py | 39 ++++++++++++++++++- 6 files changed, 60 insertions(+), 3 deletions(-) diff --git a/sciencebeam_parser/document/layout_noise_filter.py b/sciencebeam_parser/document/layout_noise_filter.py index e261866d..dad8c5d1 100644 --- a/sciencebeam_parser/document/layout_noise_filter.py +++ b/sciencebeam_parser/document/layout_noise_filter.py @@ -31,6 +31,10 @@ class LayoutNoiseFilterConfig: # Occurrences whose height exceeds this multiple of the group median are not filtered # (catches e.g. 
a large title on page 1 that also repeats as a small footer) max_height_ratio: float = 2.0 + # Never filter running-head blocks on page 1 (the paper title lives there) + preserve_first_page_head: bool = True + # When true, never filter running-foot blocks on page 1 (off by default: footers have no special status on page 1) + preserve_first_page_foot: bool = False @dataclass @@ -142,6 +146,7 @@ def _tag_noise_occurrences( page_q1_y: Dict[int, float], page_q3_y: Dict[int, float], max_height_ratio: float, + preserve_first_page: bool, ) -> List[TaggedNoiseBlock]: zone_heights = [ occ.height_relative for occ in occurrences @@ -151,6 +156,8 @@ def _tag_noise_occurrences( median_height = sorted(zone_heights)[len(zone_heights) // 2] if zone_heights else None result = [] for occ in occurrences: + if preserve_first_page and occ.page_index == 0: + continue if occ.y_relative is None: continue if not _in_zone(occ.page_index, occ.y_relative, note_type, page_q1_y, page_q3_y): @@ -205,8 +212,14 @@ def get_noise_blocks( ) if not note_type: continue + preserve = ( + config.preserve_first_page_head + if note_type == 'running-head' + else config.preserve_first_page_foot + ) noise_blocks.extend(_tag_noise_occurrences( - occurrences, note_type, page_q1_y, page_q3_y, config.max_height_ratio + occurrences, note_type, page_q1_y, page_q3_y, + config.max_height_ratio, preserve )) LOGGER.debug('found %d layout noise blocks', len(noise_blocks)) return noise_blocks diff --git a/sciencebeam_parser/processors/fulltext/config.py b/sciencebeam_parser/processors/fulltext/config.py index 28a556b1..54f1c6b7 100644 --- a/sciencebeam_parser/processors/fulltext/config.py +++ b/sciencebeam_parser/processors/fulltext/config.py @@ -50,6 +50,8 @@ class FullTextProcessorConfig(NamedTuple): max_graphic_distance: float = DEFAULT_MAX_GRAPHIC_DISTANCE noise_filter_enabled: bool = False noise_filter_repetition_fraction: float = 0.5 + noise_filter_preserve_first_page_head: bool = True + noise_filter_preserve_first_page_foot: bool = False
@staticmethod def from_app_config(app_config: AppConfig) -> 'FullTextProcessorConfig': diff --git a/sciencebeam_parser/processors/fulltext/processor.py b/sciencebeam_parser/processors/fulltext/processor.py index 21ef43f6..531fd8a8 100644 --- a/sciencebeam_parser/processors/fulltext/processor.py +++ b/sciencebeam_parser/processors/fulltext/processor.py @@ -199,6 +199,8 @@ def get_semantic_document_for_layout_document( LayoutNoiseFilterConfig( enabled=self.config.noise_filter_enabled, repetition_fraction=self.config.noise_filter_repetition_fraction, + preserve_first_page_head=self.config.noise_filter_preserve_first_page_head, + preserve_first_page_foot=self.config.noise_filter_preserve_first_page_foot, ) ) segmentation_input = remove_noise_blocks(layout_document, noise_blocks) diff --git a/sciencebeam_parser/resources/default_config/config.yml b/sciencebeam_parser/resources/default_config/config.yml index b935819a..869c52ce 100644 --- a/sciencebeam_parser/resources/default_config/config.yml +++ b/sciencebeam_parser/resources/default_config/config.yml @@ -75,6 +75,9 @@ lookup: processors: fulltext: noise_filter_enabled: true + noise_filter_repetition_fraction: 0.5 + noise_filter_preserve_first_page_head: true + noise_filter_preserve_first_page_foot: false merge_raw_authors: false use_cv_model: false cv_render_dpi: 100 diff --git a/sciencebeam_parser/service/api/routers/models.py b/sciencebeam_parser/service/api/routers/models.py index 63ac43c2..e65fa8cd 100644 --- a/sciencebeam_parser/service/api/routers/models.py +++ b/sciencebeam_parser/service/api/routers/models.py @@ -536,6 +536,8 @@ def create_models_router( noise_filter_config = LayoutNoiseFilterConfig( enabled=fulltext_processor_config.noise_filter_enabled, repetition_fraction=fulltext_processor_config.noise_filter_repetition_fraction, + preserve_first_page_head=fulltext_processor_config.noise_filter_preserve_first_page_head, + 
preserve_first_page_foot=fulltext_processor_config.noise_filter_preserve_first_page_foot, ) router.include_router( diff --git a/tests/document/layout_noise_filter_test.py b/tests/document/layout_noise_filter_test.py index b8901972..00fe927d 100644 --- a/tests/document/layout_noise_filter_test.py +++ b/tests/document/layout_noise_filter_test.py @@ -87,7 +87,7 @@ def test_detects_running_head_at_top(self): ) result = get_noise_blocks(doc, ENABLED_CONFIG) assert {nb.note_type for nb in result} == {'running-head'} - assert len(result) == 3 + assert len(result) == 2 # page 1 preserved by default def test_detects_running_foot_at_bottom(self): # y=950 → y_rel=0.95, well above q3 of each page @@ -98,6 +98,7 @@ def test_detects_running_foot_at_bottom(self): ) result = get_noise_blocks(doc, ENABLED_CONFIG) assert {nb.note_type for nb in result} == {'running-foot'} + assert len(result) == 2 # page 1 footer filtered (preserve_first_page_foot=False) def test_does_not_flag_non_repeating_block(self): doc = _doc( @@ -148,7 +149,7 @@ def test_case_insensitive_normalisation(self): _page_with_body([_block_at_y('journal name', y=10)], page_number=2), ) result = get_noise_blocks(doc, ENABLED_CONFIG) - assert len(result) == 2 + assert len(result) == 1 # page 1 preserved by default def test_large_title_on_first_page_not_filtered_when_repeated_as_small_footer(self): # Regression (scielo_br): a paper title appears large (height=60) near the @@ -173,6 +174,40 @@ def test_large_title_on_first_page_not_filtered_when_repeated_as_small_footer(se page1_title_block = doc.pages[0].blocks[0] assert id(page1_title_block) not in filtered_ids + def test_preserve_first_page_head_protects_page_1_running_head(self): + # When height check alone is insufficient (same height on all pages), + # preserve_first_page_head=True keeps the page-1 head occurrence. 
+ header_text = 'Running Header Same Size Everywhere' + config_preserve = LayoutNoiseFilterConfig( + enabled=True, repetition_fraction=0.5, preserve_first_page_head=True + ) + config_no_preserve = LayoutNoiseFilterConfig( + enabled=True, repetition_fraction=0.5, preserve_first_page_head=False + ) + doc = _doc( + _page_with_body([_block_at_y(header_text, y=10, height=8)], page_number=1), + _page_with_body([_block_at_y(header_text, y=10, height=8)], page_number=2), + _page_with_body([_block_at_y(header_text, y=10, height=8)], page_number=3), + ) + with_preserve = get_noise_blocks(doc, config_preserve) + without_preserve = get_noise_blocks(doc, config_no_preserve) + filtered_ids_preserve = {id(nb.block) for nb in with_preserve} + assert id(doc.pages[0].blocks[0]) not in filtered_ids_preserve + assert len(with_preserve) == 2 # pages 2-3 only + assert len(without_preserve) == 3 # all three pages + + def test_preserve_first_page_foot_false_filters_page_1_footer(self): + # By default, preserve_first_page_foot=False so page-1 footers are filtered. + footer_text = 'Journal Name Vol 1' + doc = _doc( + _page_with_body([_block_at_y(footer_text, y=950, height=8)], page_number=1), + _page_with_body([_block_at_y(footer_text, y=950, height=8)], page_number=2), + _page_with_body([_block_at_y(footer_text, y=950, height=8)], page_number=3), + ) + result = get_noise_blocks(doc, ENABLED_CONFIG) + assert all(nb.note_type == 'running-foot' for nb in result) + assert len(result) == 3 # page 1 footer filtered too + class TestRemoveNoiseBlocks: def test_returns_same_document_when_no_noise(self):