Skip to content
241 changes: 241 additions & 0 deletions sciencebeam_parser/document/layout_noise_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
import logging
import math
from collections import defaultdict
from dataclasses import dataclass
from typing import Dict, List, NamedTuple, Optional, Sequence, Set, Tuple

from sciencebeam_parser.document.layout_document import (
LayoutBlock,
LayoutDocument,
LayoutPage,
)

LOGGER = logging.getLogger(__name__)


class _BlockOccurrence(NamedTuple):
    """A single appearance of a text block on a page, with its relative geometry.

    Both relative values are fractions of the page height and are ``None``
    when they could not be computed (no page height or no token coordinates).
    """

    # Zero-based position of the page within the document's page list
    page_index: int
    # Smallest token ``y`` of the block divided by the page height
    y_relative: Optional[float]
    # Vertical extent covered by the block's tokens divided by the page height
    height_relative: Optional[float]
    # The layout block itself, kept so it can be tagged and removed later
    block: LayoutBlock


@dataclass
class LayoutNoiseFilterConfig:
    """Settings controlling the detection of repeated running heads and feet."""

    # Master switch: when False no noise blocks are reported at all
    enabled: bool = False
    # A text must occur at least ``repetition_fraction * page_count`` times
    # (and never fewer than twice) to be considered a repetition candidate
    repetition_fraction: float = 0.5
    # Minimum share of a candidate's occurrences that must lie in the top
    # (running head) or bottom (running foot) zone of their page
    position_consistency_fraction: float = 0.8
    # Upper bound for the standard deviation of y_relative across the
    # occurrences that lie in the zone
    max_position_stddev: float = 0.05
    # Occurrences taller than this multiple of the group's median height are
    # kept (e.g. a large title on page 1 that also repeats as a small footer)
    max_height_ratio: float = 2.0
    # When True, running-head blocks on page 1 are never filtered
    # (the paper title lives there)
    preserve_first_page_head: bool = True
    # When True, running-foot blocks on page 1 are never filtered.
    # Off by default because footers have no special status on page 1.
    preserve_first_page_foot: bool = False


@dataclass
class TaggedNoiseBlock:
    """A layout block identified as page noise, together with its category."""

    # The block that was recognised as repeated noise
    block: LayoutBlock
    # Category of the noise: "running-head" or "running-foot"
    note_type: str


def _get_block_y_relative(block: LayoutBlock, page: LayoutPage) -> Optional[float]:
    """Return the block's smallest token ``y`` as a fraction of the page height.

    Returns ``None`` when the page has no meta/coordinates, when the page
    height is missing or zero, or when no token of the block has coordinates.
    """
    meta = page.meta
    if not meta or not meta.coordinates:
        return None
    page_height = meta.coordinates.height
    if not page_height:
        # Also guards the division below against a zero height
        return None
    smallest_y = min(
        (
            token.coordinates.y
            for token in block.iter_all_tokens()
            if token.coordinates
        ),
        default=None,
    )
    if smallest_y is None:
        return None
    return smallest_y / page_height


def _get_block_height_relative(block: LayoutBlock, page: LayoutPage) -> Optional[float]:
    """Return the vertical extent of the block as a fraction of the page height.

    The extent runs from the smallest token ``y`` to the largest token
    ``y + height``. Returns ``None`` when the page height is unavailable or
    zero, or when no token of the block has coordinates.
    """
    meta = page.meta
    page_coordinates = meta.coordinates if meta else None
    page_height = page_coordinates.height if page_coordinates else None
    if not page_height:
        return None
    starts = []
    ends = []
    for token in block.iter_all_tokens():
        box = token.coordinates
        if not box:
            continue
        starts.append(box.y)
        ends.append(box.y + box.height)
    if not starts:
        return None
    return (max(ends) - min(starts)) / page_height


def _stddev(values: List[float]) -> float:
mean = sum(values) / len(values)
return math.sqrt(sum((v - mean) ** 2 for v in values) / len(values))


def _compute_page_quartiles(
page_block_y_rels: Dict[int, List[float]]
) -> Tuple[Dict[int, float], Dict[int, float]]:
"""Return (q1_per_page, q3_per_page) — 25th and 75th percentile of y_relative."""
q1_map: Dict[int, float] = {}
q3_map: Dict[int, float] = {}
for page_index, ys in page_block_y_rels.items():
if not ys:
continue
sorted_ys = sorted(ys)
n = len(sorted_ys)
q1_map[page_index] = sorted_ys[n // 4]
q3_map[page_index] = sorted_ys[min(3 * n // 4, n - 1)]
return q1_map, q3_map


def _in_zone(
page_idx: int,
y_rel: float,
note_type: str,
page_q1_y: Dict[int, float],
page_q3_y: Dict[int, float],
) -> bool:
if note_type == 'running-head':
return page_idx in page_q1_y and y_rel < page_q1_y[page_idx]
return page_idx in page_q3_y and y_rel > page_q3_y[page_idx]


def _classify_repetition_group(
    occurrences: List[_BlockOccurrence],
    page_q1_y: Dict[int, float],
    page_q3_y: Dict[int, float],
    consistency_fraction: float,
    max_position_stddev: float,
) -> Optional[str]:
    """Decide whether a group of same-text blocks is a running head or foot.

    Returns ``'running-head'`` or ``'running-foot'`` for the first category
    (checked in that order) where enough of the classifiable occurrences lie
    in the matching zone and their y_relative values are tightly clustered.
    Returns ``None`` when neither category fits or nothing is classifiable.
    """
    # Only occurrences with a known position on a page with boundaries count
    classifiable = len([
        occ for occ in occurrences
        if occ.y_relative is not None and occ.page_index in page_q1_y
    ])
    if not classifiable:
        return None
    for note_type in ('running-head', 'running-foot'):
        zone_y_rels: List[float] = []
        for occ in occurrences:
            y_rel = occ.y_relative
            if y_rel is None:
                continue
            if _in_zone(occ.page_index, y_rel, note_type, page_q1_y, page_q3_y):
                zone_y_rels.append(y_rel)
        if len(zone_y_rels) / classifiable < consistency_fraction:
            # Too few occurrences sit in this zone
            continue
        spread_too_wide = (
            len(zone_y_rels) > 1
            and _stddev(zone_y_rels) > max_position_stddev
        )
        if not spread_too_wide:
            return note_type
    return None


def _tag_noise_occurrences(
    occurrences: List[_BlockOccurrence],
    note_type: str,
    page_q1_y: Dict[int, float],
    page_q3_y: Dict[int, float],
    max_height_ratio: float,
    preserve_first_page: bool,
) -> List[TaggedNoiseBlock]:
    """Tag the occurrences of a classified group that should be treated as noise.

    An occurrence is tagged when it has a known position inside the zone of
    ``note_type``, is not on the first page while ``preserve_first_page`` is
    set, and is not taller than ``max_height_ratio`` times the median height
    of the in-zone occurrences.
    """
    in_zone_occurrences = [
        occ for occ in occurrences
        if occ.y_relative is not None
        and _in_zone(occ.page_index, occ.y_relative, note_type, page_q1_y, page_q3_y)
    ]
    known_heights = sorted(
        occ.height_relative
        for occ in in_zone_occurrences
        if occ.height_relative is not None
    )
    # Upper median; stays None when no in-zone occurrence has a height
    median_height = known_heights[len(known_heights) // 2] if known_heights else None
    tagged: List[TaggedNoiseBlock] = []
    for occ in in_zone_occurrences:
        if preserve_first_page and occ.page_index == 0:
            continue
        too_tall = (
            median_height
            and occ.height_relative is not None
            and occ.height_relative > max_height_ratio * median_height
        )
        if too_tall:
            # Unusually large compared with its siblings: keep it in the document
            continue
        tagged.append(TaggedNoiseBlock(block=occ.block, note_type=note_type))
    return tagged


def _collect_blocks(
    layout_document: LayoutDocument,
) -> Tuple[Dict[str, List[_BlockOccurrence]], Dict[int, List[float]]]:
    """Group the document's blocks by normalised text and gather positions.

    Returns a pair of plain dicts: block occurrences keyed by the stripped,
    case-folded block text (blocks with empty text are skipped), and the
    known y_relative values keyed by page index.
    """
    occurrences_by_text: Dict[str, List[_BlockOccurrence]] = {}
    y_rels_by_page: Dict[int, List[float]] = {}
    for page_index, page in enumerate(layout_document.pages):
        for block in page.blocks:
            normalized_text = block.text.strip().casefold()
            if not normalized_text:
                continue
            y_rel = _get_block_y_relative(block, page)
            occurrence = _BlockOccurrence(
                page_index=page_index,
                y_relative=y_rel,
                height_relative=_get_block_height_relative(block, page),
                block=block,
            )
            occurrences_by_text.setdefault(normalized_text, []).append(occurrence)
            if y_rel is not None:
                y_rels_by_page.setdefault(page_index, []).append(y_rel)
    return occurrences_by_text, y_rels_by_page


def get_noise_blocks(
    layout_document: LayoutDocument,
    config: LayoutNoiseFilterConfig,
) -> Sequence[TaggedNoiseBlock]:
    """Find repeated running-head / running-foot blocks in the document.

    Returns an empty list when the filter is disabled or the document has
    fewer than two pages. Otherwise every text that occurs often enough is
    classified, and its qualifying occurrences are returned as tagged blocks.
    """
    if not config.enabled:
        return []
    page_count = len(layout_document.pages)
    if page_count < 2:
        return []
    occurrences_by_text, y_rels_by_page = _collect_blocks(layout_document)
    page_q1_y, page_q3_y = _compute_page_quartiles(y_rels_by_page)
    # A text has to appear at least twice, whatever the configured fraction
    min_occurrences = max(2.0, config.repetition_fraction * page_count)
    found: List[TaggedNoiseBlock] = []
    for occurrences in occurrences_by_text.values():
        if len(occurrences) < min_occurrences:
            continue
        note_type = _classify_repetition_group(
            occurrences,
            page_q1_y,
            page_q3_y,
            config.position_consistency_fraction,
            config.max_position_stddev,
        )
        if not note_type:
            continue
        if note_type == 'running-head':
            preserve_first_page = config.preserve_first_page_head
        else:
            preserve_first_page = config.preserve_first_page_foot
        found.extend(_tag_noise_occurrences(
            occurrences,
            note_type,
            page_q1_y,
            page_q3_y,
            config.max_height_ratio,
            preserve_first_page,
        ))
    LOGGER.debug('found %d layout noise blocks', len(found))
    return found


def remove_noise_blocks(
    layout_document: LayoutDocument,
    noise_blocks: Sequence[TaggedNoiseBlock],
) -> LayoutDocument:
    """Return a document without the given noise blocks.

    The input document is returned unchanged when ``noise_blocks`` is empty.
    Blocks are matched by object identity, not by equality.
    """
    if not noise_blocks:
        return layout_document
    excluded_ids: Set[int] = {id(tagged.block) for tagged in noise_blocks}
    filtered_pages = []
    for page in layout_document.pages:
        kept_blocks = [
            block for block in page.blocks
            if id(block) not in excluded_ids
        ]
        filtered_pages.append(page.replace(blocks=kept_blocks))
    return LayoutDocument(pages=filtered_pages)
4 changes: 4 additions & 0 deletions sciencebeam_parser/processors/fulltext/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@ class FullTextProcessorConfig(NamedTuple):
use_ocr_model: bool = False
replace_text_by_cv_graphic: bool = False
max_graphic_distance: float = DEFAULT_MAX_GRAPHIC_DISTANCE
noise_filter_enabled: bool = False
noise_filter_repetition_fraction: float = 0.5
noise_filter_preserve_first_page_head: bool = True
noise_filter_preserve_first_page_foot: bool = False

@staticmethod
def from_app_config(app_config: AppConfig) -> 'FullTextProcessorConfig':
Expand Down
23 changes: 22 additions & 1 deletion sciencebeam_parser/processors/fulltext/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
SemanticLabel,
SemanticMixedContentWrapper,
SemanticMixedNote,
SemanticNote,
SemanticRawAffiliationAddress,
SemanticRawAuthors,
SemanticRawEditors,
Expand All @@ -53,6 +54,11 @@
)
from sciencebeam_parser.document.tei_document import TeiDocument, get_tei_for_semantic_document
from sciencebeam_parser.document.layout_document import LayoutDocument
from sciencebeam_parser.document.layout_noise_filter import (
LayoutNoiseFilterConfig,
get_noise_blocks,
remove_noise_blocks,
)
from sciencebeam_parser.models.segmentation.model import SegmentationModel
from sciencebeam_parser.models.header.model import HeaderModel
from sciencebeam_parser.models.name.model import NameModel
Expand Down Expand Up @@ -188,8 +194,18 @@ def get_semantic_document_for_layout_document(
layout_document,
context=context
)
segmentation_label_result = self.segmentation_model.get_label_layout_document_result(
noise_blocks = get_noise_blocks(
layout_document,
LayoutNoiseFilterConfig(
enabled=self.config.noise_filter_enabled,
repetition_fraction=self.config.noise_filter_repetition_fraction,
preserve_first_page_head=self.config.noise_filter_preserve_first_page_head,
preserve_first_page_foot=self.config.noise_filter_preserve_first_page_foot,
)
)
segmentation_input = remove_noise_blocks(layout_document, noise_blocks)
segmentation_label_result = self.segmentation_model.get_label_layout_document_result(
segmentation_input,
app_features_context=self.app_features_context
)
header_layout_document = segmentation_label_result.get_filtered_document_by_label(
Expand Down Expand Up @@ -265,6 +281,11 @@ def get_semantic_document_for_layout_document(
self._assign_target_content_ids(table_citations, SimpleContentIdMatcher(
self._get_semantic_content_text_by_content_id(tables, SemanticLabel)
))
for nb in noise_blocks:
document.body_section.add_content(SemanticNote(
layout_block=nb.block,
note_type=nb.note_type
))
if self.config.extract_graphic_bounding_boxes:
self._process_graphics(
document=document,
Expand Down
4 changes: 4 additions & 0 deletions sciencebeam_parser/resources/default_config/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,10 @@ lookup:
- https://raw.githubusercontent.com/kermitt2/grobid/0.6.2/grobid-home/lexicon/names/names.family
processors:
fulltext:
noise_filter_enabled: true
noise_filter_repetition_fraction: 0.5
noise_filter_preserve_first_page_head: true
noise_filter_preserve_first_page_foot: false
merge_raw_authors: false
use_cv_model: false
cv_render_dpi: 100
Expand Down
Loading
Loading