class _BlockOccurrence(NamedTuple):
    """One appearance of a block's normalised text on a page, with its relative geometry."""
    # Zero-based index of the page within ``LayoutDocument.pages``.
    page_index: int
    # Smallest token ``y`` in the block divided by the page height;
    # ``None`` when the page height or all token coordinates are missing.
    y_relative: Optional[float]
    # Vertical span of the block's tokens (max ``y + height`` minus min ``y``)
    # divided by the page height; ``None`` under the same conditions as above.
    height_relative: Optional[float]
    # The layout block itself, kept so it can later be tagged as noise.
    block: LayoutBlock
a large title on page 1 that also repeats as a small footer) + max_height_ratio: float = 2.0 + # Never filter running-head blocks on page 1 (the paper title lives there) + preserve_first_page_head: bool = True + # Never filter running-foot blocks on page 1 (footers have no special status on page 1) + preserve_first_page_foot: bool = False + + +@dataclass +class TaggedNoiseBlock: + block: LayoutBlock + note_type: str # "running-head" | "running-foot" + + +def _get_block_y_relative(block: LayoutBlock, page: LayoutPage) -> Optional[float]: + page_height = ( + page.meta.coordinates.height + if page.meta and page.meta.coordinates + else None + ) + if not page_height: + return None + y_values = [ + token.coordinates.y + for token in block.iter_all_tokens() + if token.coordinates + ] + if not y_values: + return None + return min(y_values) / page_height + + +def _get_block_height_relative(block: LayoutBlock, page: LayoutPage) -> Optional[float]: + page_height = ( + page.meta.coordinates.height + if page.meta and page.meta.coordinates + else None + ) + if not page_height: + return None + extents = [ + (token.coordinates.y, token.coordinates.y + token.coordinates.height) + for token in block.iter_all_tokens() + if token.coordinates + ] + if not extents: + return None + return (max(y1 for _, y1 in extents) - min(y0 for y0, _ in extents)) / page_height + + +def _stddev(values: List[float]) -> float: + mean = sum(values) / len(values) + return math.sqrt(sum((v - mean) ** 2 for v in values) / len(values)) + + +def _compute_page_quartiles( + page_block_y_rels: Dict[int, List[float]] +) -> Tuple[Dict[int, float], Dict[int, float]]: + """Return (q1_per_page, q3_per_page) — 25th and 75th percentile of y_relative.""" + q1_map: Dict[int, float] = {} + q3_map: Dict[int, float] = {} + for page_index, ys in page_block_y_rels.items(): + if not ys: + continue + sorted_ys = sorted(ys) + n = len(sorted_ys) + q1_map[page_index] = sorted_ys[n // 4] + q3_map[page_index] = sorted_ys[min(3 * n // 
4, n - 1)] + return q1_map, q3_map + + +def _in_zone( + page_idx: int, + y_rel: float, + note_type: str, + page_q1_y: Dict[int, float], + page_q3_y: Dict[int, float], +) -> bool: + if note_type == 'running-head': + return page_idx in page_q1_y and y_rel < page_q1_y[page_idx] + return page_idx in page_q3_y and y_rel > page_q3_y[page_idx] + + +def _classify_repetition_group( + occurrences: List[_BlockOccurrence], + page_q1_y: Dict[int, float], + page_q3_y: Dict[int, float], + consistency_fraction: float, + max_position_stddev: float, +) -> Optional[str]: + classifiable = sum( + 1 for occ in occurrences + if occ.y_relative is not None and occ.page_index in page_q1_y + ) + if not classifiable: + return None + for note_type in ('running-head', 'running-foot'): + zone = [ + occ for occ in occurrences + if occ.y_relative is not None + and _in_zone(occ.page_index, occ.y_relative, note_type, page_q1_y, page_q3_y) + ] + if len(zone) / classifiable < consistency_fraction: + continue + y_rels = [occ.y_relative for occ in zone if occ.y_relative is not None] + if len(y_rels) > 1 and _stddev(y_rels) > max_position_stddev: + continue + return note_type + return None + + +def _tag_noise_occurrences( + occurrences: List[_BlockOccurrence], + note_type: str, + page_q1_y: Dict[int, float], + page_q3_y: Dict[int, float], + max_height_ratio: float, + preserve_first_page: bool, +) -> List[TaggedNoiseBlock]: + zone_heights = [ + occ.height_relative for occ in occurrences + if occ.height_relative is not None and occ.y_relative is not None + and _in_zone(occ.page_index, occ.y_relative, note_type, page_q1_y, page_q3_y) + ] + median_height = sorted(zone_heights)[len(zone_heights) // 2] if zone_heights else None + result = [] + for occ in occurrences: + if preserve_first_page and occ.page_index == 0: + continue + if occ.y_relative is None: + continue + if not _in_zone(occ.page_index, occ.y_relative, note_type, page_q1_y, page_q3_y): + continue + if (median_height and occ.height_relative is not 
def get_noise_blocks(
    layout_document: LayoutDocument,
    config: LayoutNoiseFilterConfig,
) -> Sequence[TaggedNoiseBlock]:
    """Find blocks that repeat across pages as running heads or running feet.

    Blocks are grouped by their normalised text. A group is a candidate when
    it appears on enough *distinct pages*; it is then classified by position
    and its qualifying occurrences are tagged.

    Args:
        layout_document: the document whose pages are scanned.
        config: filter settings; when ``config.enabled`` is false nothing is
            detected.

    Returns:
        The tagged noise blocks. Empty when the filter is disabled, when the
        document has fewer than two pages, or when no group qualifies.
    """
    if not config.enabled:
        return []
    total_pages = len(layout_document.pages)
    if total_pages < 2:
        # Repetition across pages cannot be observed on a single page.
        return []
    text_to_occurrences, page_block_y_rels = _collect_blocks(layout_document)
    page_q1_y, page_q3_y = _compute_page_quartiles(page_block_y_rels)
    # At least two pages, or the configured fraction of all pages.
    threshold = max(2.0, config.repetition_fraction * total_pages)
    noise_blocks: List[TaggedNoiseBlock] = []
    for occurrences in text_to_occurrences.values():
        # The threshold is expressed in pages, so count distinct pages rather
        # than raw occurrences: text repeated many times on one page (table
        # cells, list markers) is not repetition across pages.
        page_count = len({occ.page_index for occ in occurrences})
        if page_count < threshold:
            continue
        note_type = _classify_repetition_group(
            occurrences, page_q1_y, page_q3_y,
            config.position_consistency_fraction,
            config.max_position_stddev,
        )
        if not note_type:
            continue
        # Heads and feet have separate first-page preservation switches.
        preserve = (
            config.preserve_first_page_head
            if note_type == 'running-head'
            else config.preserve_first_page_foot
        )
        noise_blocks.extend(_tag_noise_occurrences(
            occurrences, note_type, page_q1_y, page_q3_y,
            config.max_height_ratio, preserve
        ))
    LOGGER.debug('found %d layout noise blocks', len(noise_blocks))
    return noise_blocks
import SegmentationModel from sciencebeam_parser.models.header.model import HeaderModel from sciencebeam_parser.models.name.model import NameModel @@ -188,8 +194,18 @@ def get_semantic_document_for_layout_document( layout_document, context=context ) - segmentation_label_result = self.segmentation_model.get_label_layout_document_result( + noise_blocks = get_noise_blocks( layout_document, + LayoutNoiseFilterConfig( + enabled=self.config.noise_filter_enabled, + repetition_fraction=self.config.noise_filter_repetition_fraction, + preserve_first_page_head=self.config.noise_filter_preserve_first_page_head, + preserve_first_page_foot=self.config.noise_filter_preserve_first_page_foot, + ) + ) + segmentation_input = remove_noise_blocks(layout_document, noise_blocks) + segmentation_label_result = self.segmentation_model.get_label_layout_document_result( + segmentation_input, app_features_context=self.app_features_context ) header_layout_document = segmentation_label_result.get_filtered_document_by_label( @@ -265,6 +281,11 @@ def get_semantic_document_for_layout_document( self._assign_target_content_ids(table_citations, SimpleContentIdMatcher( self._get_semantic_content_text_by_content_id(tables, SemanticLabel) )) + for nb in noise_blocks: + document.body_section.add_content(SemanticNote( + layout_block=nb.block, + note_type=nb.note_type + )) if self.config.extract_graphic_bounding_boxes: self._process_graphics( document=document, diff --git a/sciencebeam_parser/resources/default_config/config.yml b/sciencebeam_parser/resources/default_config/config.yml index f1abfee7..869c52ce 100644 --- a/sciencebeam_parser/resources/default_config/config.yml +++ b/sciencebeam_parser/resources/default_config/config.yml @@ -74,6 +74,10 @@ lookup: - https://raw.githubusercontent.com/kermitt2/grobid/0.6.2/grobid-home/lexicon/names/names.family processors: fulltext: + noise_filter_enabled: true + noise_filter_repetition_fraction: 0.5 + noise_filter_preserve_first_page_head: true + 
def _apply_noise_filter(self, layout_document: LayoutDocument) -> LayoutDocument:
    """Return ``layout_document`` with detected noise blocks removed.

    The document is returned unchanged when no noise filter config was
    supplied or the config is not enabled.
    """
    config = self.noise_filter_config
    if config and config.enabled:
        detected = get_noise_blocks(layout_document, config)
        return remove_noise_blocks(layout_document, detected)
    return layout_document
self._apply_noise_filter( + normalize_layout_document(parse_alto_root(root)) ), filter_params=(filter_params or {}) ) @@ -520,13 +533,20 @@ def create_models_router( fulltext_models = sciencebeam_parser.fulltext_models app_features_context = sciencebeam_parser.app_features_context fulltext_processor_config = sciencebeam_parser.fulltext_processor_config + noise_filter_config = LayoutNoiseFilterConfig( + enabled=fulltext_processor_config.noise_filter_enabled, + repetition_fraction=fulltext_processor_config.noise_filter_repetition_fraction, + preserve_first_page_head=fulltext_processor_config.noise_filter_preserve_first_page_head, + preserve_first_page_foot=fulltext_processor_config.noise_filter_preserve_first_page_foot, + ) router.include_router( ModelResponseRouterFactory( 'Segmentation', model=fulltext_models.segmentation_model, pdfalto_wrapper=pdfalto_wrapper, - app_features_context=app_features_context + app_features_context=app_features_context, + noise_filter_config=noise_filter_config ).create_router(), prefix='/models/segmentation' ) @@ -538,7 +558,8 @@ def create_models_router( pdfalto_wrapper=pdfalto_wrapper, app_features_context=app_features_context, segmentation_model=fulltext_models.segmentation_model, - segmentation_labels=['
'] + segmentation_labels=['
'], + noise_filter_config=noise_filter_config ).create_router(), prefix='/models/header' ) @@ -552,7 +573,8 @@ def create_models_router( segmentation_model=fulltext_models.segmentation_model, segmentation_labels=['
'], header_model=fulltext_models.header_model, - merge_raw_authors=fulltext_processor_config.merge_raw_authors + merge_raw_authors=fulltext_processor_config.merge_raw_authors, + noise_filter_config=noise_filter_config ).create_router(), prefix='/models/name-header' ) @@ -565,7 +587,8 @@ def create_models_router( app_features_context=app_features_context, segmentation_model=fulltext_models.segmentation_model, segmentation_labels=['
'], - header_model=fulltext_models.header_model + header_model=fulltext_models.header_model, + noise_filter_config=noise_filter_config ).create_router(), prefix='/models/affiliation-address' ) @@ -579,7 +602,8 @@ def create_models_router( pdfalto_wrapper=pdfalto_wrapper, app_features_context=app_features_context, segmentation_model=fulltext_models.segmentation_model, - segmentation_labels=fulltext_segmentation_labels + segmentation_labels=fulltext_segmentation_labels, + noise_filter_config=noise_filter_config ).create_router(), prefix='/models/fulltext' ) @@ -592,7 +616,8 @@ def create_models_router( app_features_context=app_features_context, segmentation_model=fulltext_models.segmentation_model, segmentation_labels=fulltext_segmentation_labels, - fulltext_model=fulltext_models.fulltext_model + fulltext_model=fulltext_models.fulltext_model, + noise_filter_config=noise_filter_config ).create_router(), prefix='/models/figure' ) @@ -605,7 +630,8 @@ def create_models_router( app_features_context=app_features_context, segmentation_model=fulltext_models.segmentation_model, segmentation_labels=fulltext_segmentation_labels, - fulltext_model=fulltext_models.fulltext_model + fulltext_model=fulltext_models.fulltext_model, + noise_filter_config=noise_filter_config ).create_router(), prefix='/models/table' ) @@ -617,7 +643,8 @@ def create_models_router( pdfalto_wrapper=pdfalto_wrapper, app_features_context=app_features_context, segmentation_model=fulltext_models.segmentation_model, - segmentation_labels=[''] + segmentation_labels=[''], + noise_filter_config=noise_filter_config ).create_router(), prefix='/models/reference-segmenter' ) @@ -630,7 +657,8 @@ def create_models_router( app_features_context=app_features_context, segmentation_model=fulltext_models.segmentation_model, segmentation_labels=[''], - reference_segmenter_model=fulltext_models.reference_segmenter_model + reference_segmenter_model=fulltext_models.reference_segmenter_model, + 
noise_filter_config=noise_filter_config ).create_router(), prefix='/models/citation' ) @@ -644,7 +672,8 @@ def create_models_router( segmentation_model=fulltext_models.segmentation_model, segmentation_labels=[''], reference_segmenter_model=fulltext_models.reference_segmenter_model, - citation_model=fulltext_models.citation_model + citation_model=fulltext_models.citation_model, + noise_filter_config=noise_filter_config ).create_router(), prefix='/models/name-citation' ) diff --git a/tests/document/layout_noise_filter_test.py b/tests/document/layout_noise_filter_test.py new file mode 100644 index 00000000..00fe927d --- /dev/null +++ b/tests/document/layout_noise_filter_test.py @@ -0,0 +1,237 @@ +from sciencebeam_parser.document.layout_document import ( + LayoutBlock, + LayoutDocument, + LayoutLine, + LayoutPage, + LayoutPageCoordinates, + LayoutPageMeta, + LayoutToken, +) +from sciencebeam_parser.document.layout_noise_filter import ( + LayoutNoiseFilterConfig, + TaggedNoiseBlock, + get_noise_blocks, + remove_noise_blocks, +) + + +PAGE_HEIGHT = 1000 +PAGE_WIDTH = 600 +ENABLED_CONFIG = LayoutNoiseFilterConfig(enabled=True, repetition_fraction=0.5) + +# Body blocks at these y positions give a spread across the middle of the page. +# y_rels: 0.2, 0.3, 0.5, 0.6, 0.7 → q1≈0.3, q3≈0.6 when combined with a +# header at y_rel≈0.01 or footer at y_rel≈0.95. 
+_BODY_Y_POSITIONS = [200, 300, 500, 600, 700] + + +def _page_meta(page_number: int = 1) -> LayoutPageMeta: + return LayoutPageMeta( + page_number=page_number, + coordinates=LayoutPageCoordinates( + x=0, y=0, width=PAGE_WIDTH, height=PAGE_HEIGHT, page_number=page_number + ) + ) + + +def _block_at_y( + text: str, y: float, page_number: int = 1, height: float = 20 +) -> LayoutBlock: + token = LayoutToken( + text=text, + coordinates=LayoutPageCoordinates( + x=10, y=y, width=200, height=height, page_number=page_number + ) + ) + return LayoutBlock(lines=[LayoutLine(tokens=[token])]) + + +def _page(blocks: list, page_number: int = 1) -> LayoutPage: + return LayoutPage(blocks=blocks, meta=_page_meta(page_number)) + + +def _page_with_body(extra_blocks: list, page_number: int) -> LayoutPage: + """Build a realistic page with unique body content plus any extra blocks.""" + body_blocks = [ + _block_at_y(f'body p{page_number} i{i}', y=y, page_number=page_number) + for i, y in enumerate(_BODY_Y_POSITIONS) + ] + return _page(extra_blocks + body_blocks, page_number=page_number) + + +def _doc(*pages: LayoutPage) -> LayoutDocument: + return LayoutDocument(pages=list(pages)) + + +class TestGetNoiseBlocks: + def test_returns_empty_when_disabled(self): + doc = _doc( + _page_with_body([_block_at_y('Journal Name', y=10)], page_number=1), + _page_with_body([_block_at_y('Journal Name', y=10)], page_number=2), + ) + result = get_noise_blocks(doc, LayoutNoiseFilterConfig(enabled=False)) + assert not result + + def test_returns_empty_for_single_page(self): + doc = _doc(_page_with_body([_block_at_y('Journal Name', y=10)], page_number=1)) + result = get_noise_blocks(doc, ENABLED_CONFIG) + assert not result + + def test_detects_running_head_at_top(self): + # y=10 → y_rel=0.01, well below q1 of each page + header_text = 'Journal of Something' + doc = _doc( + _page_with_body([_block_at_y(header_text, y=10)], page_number=1), + _page_with_body([_block_at_y(header_text, y=10)], page_number=2), + 
_page_with_body([_block_at_y(header_text, y=10)], page_number=3), + ) + result = get_noise_blocks(doc, ENABLED_CONFIG) + assert {nb.note_type for nb in result} == {'running-head'} + assert len(result) == 2 # page 1 preserved by default + + def test_detects_running_foot_at_bottom(self): + # y=950 → y_rel=0.95, well above q3 of each page + footer_text = 'Copyright 2024' + doc = _doc( + _page_with_body([_block_at_y(footer_text, y=950)], page_number=1), + _page_with_body([_block_at_y(footer_text, y=950)], page_number=2), + ) + result = get_noise_blocks(doc, ENABLED_CONFIG) + assert {nb.note_type for nb in result} == {'running-foot'} + assert len(result) == 2 # page 1 footer filtered (preserve_first_page_foot=False) + + def test_does_not_flag_non_repeating_block(self): + doc = _doc( + _page_with_body([_block_at_y('Unique text page 1', y=10)], page_number=1), + _page_with_body([_block_at_y('Unique text page 2', y=10)], page_number=2), + ) + result = get_noise_blocks(doc, ENABLED_CONFIG) + assert not result + + def test_does_not_flag_repeating_block_in_middle(self): + # y=400 → y_rel=0.4, between q1 and q3 → neither zone + mid_text = 'Section Title' + doc = _doc( + _page_with_body([_block_at_y(mid_text, y=400)], page_number=1), + _page_with_body([_block_at_y(mid_text, y=400)], page_number=2), + ) + result = get_noise_blocks(doc, ENABLED_CONFIG) + assert not any(nb.block.text.strip() == mid_text for nb in result) + + def test_does_not_flag_block_with_inconsistent_position(self): + # Same text but moves between top and bottom across pages → high stddev → not noise + wandering_text = 'Wandering Block' + doc = _doc( + _page_with_body([_block_at_y(wandering_text, y=10)], page_number=1), + _page_with_body([_block_at_y(wandering_text, y=950)], page_number=2), + _page_with_body([_block_at_y(wandering_text, y=10)], page_number=3), + _page_with_body([_block_at_y(wandering_text, y=950)], page_number=4), + ) + result = get_noise_blocks(doc, ENABLED_CONFIG) + assert not 
any(nb.block.text.strip() == wandering_text for nb in result) + + def test_respects_repetition_fraction(self): + # 2 out of 5 pages = 0.4, below default threshold of 0.5 → not flagged + header_text = 'Sparse Header' + doc = _doc( + _page_with_body([_block_at_y(header_text, y=10)], page_number=1), + _page_with_body([], page_number=2), + _page_with_body([], page_number=3), + _page_with_body([], page_number=4), + _page_with_body([_block_at_y(header_text, y=10)], page_number=5), + ) + result = get_noise_blocks(doc, ENABLED_CONFIG) + assert not any(nb.block.text.strip() == header_text for nb in result) + + def test_case_insensitive_normalisation(self): + doc = _doc( + _page_with_body([_block_at_y('JOURNAL NAME', y=10)], page_number=1), + _page_with_body([_block_at_y('journal name', y=10)], page_number=2), + ) + result = get_noise_blocks(doc, ENABLED_CONFIG) + assert len(result) == 1 # page 1 preserved by default + + def test_large_title_on_first_page_not_filtered_when_repeated_as_small_footer(self): + # Regression (scielo_br): a paper title appears large (height=60) near the + # top of page 1, and the same text repeats as a small footer (height=8) at + # the bottom of pages 2+. The height check must preserve the page-1 title. 
+ title_text = 'A radicalização do debate sobre inclusão escolar no Brasil' + doc = _doc( + # page 1: large title near the top + _page_with_body([_block_at_y(title_text, y=50, height=60)], page_number=1), + # pages 2-5: same text as small footer at the bottom + _page_with_body([_block_at_y(title_text, y=950, height=8)], page_number=2), + _page_with_body([_block_at_y(title_text, y=950, height=8)], page_number=3), + _page_with_body([_block_at_y(title_text, y=950, height=8)], page_number=4), + _page_with_body([_block_at_y(title_text, y=950, height=8)], page_number=5), + ) + result = get_noise_blocks(doc, ENABLED_CONFIG) + # Footer occurrences on pages 2-5 are filtered + assert all(nb.note_type == 'running-foot' for nb in result) + assert len(result) == 4 + # The large page-1 title must not be among the filtered blocks + filtered_ids = {id(nb.block) for nb in result} + page1_title_block = doc.pages[0].blocks[0] + assert id(page1_title_block) not in filtered_ids + + def test_preserve_first_page_head_protects_page_1_running_head(self): + # When height check alone is insufficient (same height on all pages), + # preserve_first_page_head=True keeps the page-1 head occurrence. 
+ header_text = 'Running Header Same Size Everywhere' + config_preserve = LayoutNoiseFilterConfig( + enabled=True, repetition_fraction=0.5, preserve_first_page_head=True + ) + config_no_preserve = LayoutNoiseFilterConfig( + enabled=True, repetition_fraction=0.5, preserve_first_page_head=False + ) + doc = _doc( + _page_with_body([_block_at_y(header_text, y=10, height=8)], page_number=1), + _page_with_body([_block_at_y(header_text, y=10, height=8)], page_number=2), + _page_with_body([_block_at_y(header_text, y=10, height=8)], page_number=3), + ) + with_preserve = get_noise_blocks(doc, config_preserve) + without_preserve = get_noise_blocks(doc, config_no_preserve) + filtered_ids_preserve = {id(nb.block) for nb in with_preserve} + assert id(doc.pages[0].blocks[0]) not in filtered_ids_preserve + assert len(with_preserve) == 2 # pages 2-3 only + assert len(without_preserve) == 3 # all three pages + + def test_preserve_first_page_foot_false_filters_page_1_footer(self): + # By default, preserve_first_page_foot=False so page-1 footers are filtered. 
class TestRemoveNoiseBlocks:
    """Checks for ``remove_noise_blocks`` on small hand-built documents."""

    def test_returns_same_document_when_no_noise(self):
        # With nothing to remove, the very same document object comes back.
        original = _doc(_page([_block_at_y('body', y=400)]))
        filtered = remove_noise_blocks(original, [])
        assert filtered is original

    def test_removes_tagged_blocks(self):
        body = _block_at_y('body text', y=400)
        header = _block_at_y('Journal Name', y=10)
        tagged = [TaggedNoiseBlock(block=header, note_type='running-head')]
        filtered = remove_noise_blocks(_doc(_page([body, header])), tagged)
        kept_blocks = list(filtered.iter_all_blocks())
        assert body in kept_blocks
        assert header not in kept_blocks

    def test_preserves_page_structure(self):
        # Two pages in, two pages out; only the tagged block disappears.
        first_body = _block_at_y('block 1', y=400, page_number=1)
        first_header = _block_at_y('header', y=10, page_number=1)
        second_body = _block_at_y('block 3', y=400, page_number=2)
        document = _doc(
            _page([first_body, first_header], page_number=1),
            _page([second_body], page_number=2),
        )
        tagged = [TaggedNoiseBlock(block=first_header, note_type='running-head')]
        filtered = remove_noise_blocks(document, tagged)
        assert len(filtered.pages) == 2
        assert len(filtered.pages[0].blocks) == 1
        assert len(filtered.pages[1].blocks) == 1