diff --git a/src/lib.rs b/src/lib.rs
index cd58e2e..7aef03f 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1005,31 +1005,119 @@ pub fn normalize_sentence_with_options(input: &str, options: NormalizeOptions) -
     normalize_sentence_inner(input, max_span, options.concat_compound_numbers)
 }
 
-/// Sentence-mode dispatch loop. The `concat_compound` flag is forwarded to
-/// [`parse_span`] so each span sees the right tagger priorities.
-fn normalize_sentence_inner(input: &str, max_span_tokens: usize, concat_compound: bool) -> String {
-    let trimmed = input.trim();
-    if trimmed.is_empty() {
-        return trimmed.to_string();
+/// Per-pretoken record: the token text plus the original separator that
+/// preceded it (`" "` for whitespace, `""` if the token came from a
+/// punctuation split or starts the input).
+struct Pretoken {
+    text: String,
+    sep: &'static str,
+}
+
+/// ASCII punctuation characters that are split off the leading or trailing
+/// edge of a whitespace-separated word so taggers see clean number phrases
+/// (issue #21). Apostrophes and hyphens are intentionally excluded so
+/// contractions ("don't") and hyphenated words ("twenty-one") stay intact.
+fn is_split_punct(c: char) -> bool {
+    matches!(
+        c,
+        ',' | '.' | ';' | ':' | '!' | '?' | '(' | ')' | '[' | ']' | '{' | '}' | '"'
+    )
+}
+
+/// Split each whitespace-separated word into leading punctuation, core, and
+/// trailing punctuation pretokens, preserving the original leading separator
+/// on the first piece of each word so output reconstruction matches input
+/// spacing.
+fn pretokenize(input: &str) -> Vec<Pretoken> {
+    let mut out: Vec<Pretoken> = Vec::new();
+    let mut first_word = true;
+
+    for word in input.split_whitespace() {
+        let word_lead: &'static str = if first_word { "" } else { " " };
+        first_word = false;
+
+        // char_indices keeps multi-byte chars intact when slicing.
+        let chars: Vec<(usize, char)> = word.char_indices().collect();
+        if chars.is_empty() {
+            continue;
+        }
+
+        let mut start = 0usize;
+        while start < chars.len() && is_split_punct(chars[start].1) {
+            start += 1;
+        }
+        let mut end = chars.len();
+        while end > start && is_split_punct(chars[end - 1].1) {
+            end -= 1;
+        }
+
+        let mut next_sep = word_lead;
+
+        // Leading punctuation: each character becomes its own pretoken.
+        for &(_, ch) in &chars[..start] {
+            out.push(Pretoken {
+                text: ch.to_string(),
+                sep: next_sep,
+            });
+            next_sep = "";
+        }
+
+        // Core text (may be empty if the whole word was punctuation).
+        if start < end {
+            let core_start = chars[start].0;
+            let core_end = if end < chars.len() {
+                chars[end].0
+            } else {
+                word.len()
+            };
+            out.push(Pretoken {
+                text: word[core_start..core_end].to_string(),
+                sep: next_sep,
+            });
+            next_sep = "";
+        }
+
+        // Trailing punctuation.
+        for &(_, ch) in &chars[end..] {
+            out.push(Pretoken {
+                text: ch.to_string(),
+                sep: next_sep,
+            });
+            next_sep = "";
+        }
     }
+
+    out
+}
+
+/// Shared sliding-window match loop used by the three sentence-mode entry
+/// points. `parser` returns `Some((replacement, score))` if the joined span
+/// can be normalized.
+fn sentence_loop<F>(pretokens: &[Pretoken], max_span_tokens: usize, parser: F) -> String
+where
+    F: Fn(&str) -> Option<(String, u8)>,
+{
     let max_span = if max_span_tokens == 0 {
         1
     } else {
         max_span_tokens
     };
-    let tokens: Vec<&str> = trimmed.split_whitespace().collect();
-    let mut out: Vec<String> = Vec::with_capacity(tokens.len());
+
+    let mut out = String::new();
     let mut i = 0usize;
 
-    while i < tokens.len() {
-        let max_end = usize::min(tokens.len(), i + max_span);
+    while i < pretokens.len() {
+        let max_end = usize::min(pretokens.len(), i + max_span);
         let mut best: Option<(usize, String, u8)> = None;
 
         // Longest-span-first search keeps replacements stable and non-overlapping.
         for end in (i + 1..=max_end).rev() {
-            let span = tokens[i..end].join(" ");
-            let Some((candidate, score)) = parse_span(&span, concat_compound) else {
+            let span: String = pretokens[i..end]
+                .iter()
+                .map(|p| p.text.as_str())
+                .collect::<Vec<&str>>()
+                .join(" ");
+            let Some((candidate, score)) = parser(&span) else {
                 continue;
             };
 
@@ -1056,15 +1144,31 @@ fn normalize_sentence_inner(input: &str, max_span_tokens: usize, concat_compound
         }
 
         if let Some((end, replacement, _)) = best {
-            out.push(replacement);
+            out.push_str(pretokens[i].sep);
+            out.push_str(&replacement);
             i = end;
         } else {
-            out.push(tokens[i].to_string());
+            out.push_str(pretokens[i].sep);
+            out.push_str(&pretokens[i].text);
             i += 1;
         }
     }
 
-    out.join(" ")
+    out
+}
+
+/// Sentence-mode dispatch loop. The `concat_compound` flag is forwarded to
+/// [`parse_span`] so each span sees the right tagger priorities.
+fn normalize_sentence_inner(input: &str, max_span_tokens: usize, concat_compound: bool) -> String {
+    let trimmed = input.trim();
+    if trimmed.is_empty() {
+        return trimmed.to_string();
+    }
+
+    let pretokens = pretokenize(trimmed);
+    sentence_loop(&pretokens, max_span_tokens, |span| {
+        parse_span(span, concat_compound)
+    })
 }
 
 // ── Text Normalization (written → spoken) ─────────────────────────────
@@ -1210,56 +1314,10 @@ pub fn tn_normalize_sentence_with_max_span_lang(
         return trimmed.to_string();
     }
 
-    let max_span = if max_span_tokens == 0 {
-        1
-    } else {
-        max_span_tokens
-    };
-    let tokens: Vec<&str> = trimmed.split_whitespace().collect();
-    let mut out: Vec<String> = Vec::with_capacity(tokens.len());
-    let mut i = 0usize;
-
-    while i < tokens.len() {
-        let max_end = usize::min(tokens.len(), i + max_span);
-        let mut best: Option<(usize, String, u8)> = None;
-
-        for end in (i + 1..=max_end).rev() {
-            let span = tokens[i..end].join(" ");
-            let Some((candidate, score)) = tn_parse_span_lang(&span, lang) else {
-                continue;
-            };
-
-            let candidate_trimmed = candidate.trim();
-            if candidate_trimmed.is_empty() || candidate_trimmed == span {
-                continue;
-            }
-
-            let candidate_len = end - i;
-            match &best {
-                None => {
-                    best = Some((end, candidate, score));
-                }
-                Some((best_end, _, best_score)) => {
-                    let best_len = *best_end - i;
-                    if candidate_len > best_len
-                        || (candidate_len == best_len && score > *best_score)
-                    {
-                        best = Some((end, candidate, score));
-                    }
-                }
-            }
-        }
-
-        if let Some((end, replacement, _)) = best {
-            out.push(replacement);
-            i = end;
-        } else {
-            out.push(tokens[i].to_string());
-            i += 1;
-        }
-    }
-
-    out.join(" ")
+    let pretokens = pretokenize(trimmed);
+    sentence_loop(&pretokens, max_span_tokens, |span| {
+        tn_parse_span_lang(span, lang)
+    })
 }
 }
 }
@@ -1271,56 +1329,8 @@ pub fn tn_normalize_sentence_with_max_span(input: &str, max_span_tokens: usize)
         return trimmed.to_string();
     }
 
-    let max_span = if max_span_tokens == 0 {
-        1
-    } else {
-        max_span_tokens
-    };
-    let tokens: Vec<&str> = trimmed.split_whitespace().collect();
-    let mut out: Vec<String> = Vec::with_capacity(tokens.len());
-    let mut i = 0usize;
-
-    while i < tokens.len() {
-        let max_end = usize::min(tokens.len(), i + max_span);
-        let mut best: Option<(usize, String, u8)> = None;
-
-        for end in (i + 1..=max_end).rev() {
-            let span = tokens[i..end].join(" ");
-            let Some((candidate, score)) = tn_parse_span(&span) else {
-                continue;
-            };
-
-            let candidate_trimmed = candidate.trim();
-            if candidate_trimmed.is_empty() || candidate_trimmed == span {
-                continue;
-            }
-
-            let candidate_len = end - i;
-            match &best {
-                None => {
-                    best = Some((end, candidate, score));
-                }
-                Some((best_end, _, best_score)) => {
-                    let best_len = *best_end - i;
-                    if candidate_len > best_len
-                        || (candidate_len == best_len && score > *best_score)
-                    {
-                        best = Some((end, candidate, score));
-                    }
-                }
-            }
-        }
-
-        if let Some((end, replacement, _)) = best {
-            out.push(replacement);
-            i = end;
-        } else {
-            out.push(tokens[i].to_string());
-            i += 1;
-        }
-    }
-
-    out.join(" ")
+    let pretokens = pretokenize(trimmed);
+    sentence_loop(&pretokens, max_span_tokens, tn_parse_span)
 }
 
 #[cfg(test)]
diff --git a/tests/en_tests.rs b/tests/en_tests.rs
index b149270..53cf871 100644
--- a/tests/en_tests.rs
+++ b/tests/en_tests.rs
@@ -1125,3 +1125,68 @@ fn test_issue_23_compound_concat() {
         "2017"
     );
 }
+
+/// Issue #21: trailing punctuation glued to the last word of a number phrase
+/// (e.g. `"eight,"`) used to block the cardinal tagger because
+/// `split_whitespace` left punctuation attached. The pretokenizer splits
+/// leading/trailing ASCII punctuation off each whitespace token while
+/// preserving the original spacing on output.
+#[test]
+fn test_issue_21_trailing_comma() {
+    let opts = concat_opts();
+    assert_eq!(
+        normalize_sentence_with_options(
+            "United seven eighty eight, please come up on frequency one three five point six two five, thanks.",
+            opts,
+        ),
+        "United 788, please come up on frequency 135.625, thanks."
+    );
+}
+
+#[test]
+fn test_issue_21_trailing_period() {
+    let opts = concat_opts();
+    assert_eq!(
+        normalize_sentence_with_options(
+            "United seven eighty eight. please come up on frequency one three five point six two five. thanks.",
+            opts,
+        ),
+        "United 788. please come up on frequency 135.625. thanks."
+    );
+}
+
+/// The pre-existing space-before-punctuation case must continue to work,
+/// preserving the explicit space in output.
+#[test]
+fn test_issue_21_space_before_punct_preserved() {
+    let opts = concat_opts();
+    assert_eq!(
+        normalize_sentence_with_options(
+            "United seven eighty eight , please come up on frequency one three five point six two five , thanks .",
+            opts,
+        ),
+        "United 788 , please come up on frequency 135.625 , thanks ."
+    );
+}
+
+/// Punctuation other than `,` and `.` is also split, including paired
+/// brackets and quotes, while contractions and hyphenated words remain
+/// intact.
+#[test]
+fn test_issue_21_other_punctuation() {
+    // Default options (no concat) still benefits from the fix.
+    assert_eq!(
+        normalize_sentence("I have twenty one apples."),
+        "I have 21 apples."
+    );
+    assert_eq!(
+        normalize_sentence("I have (twenty one) apples!"),
+        "I have (21) apples!"
+    );
+    assert_eq!(normalize_sentence("\"twenty one\" apples"), "\"21\" apples");
+    // Apostrophe inside contraction is NOT split.
+    assert_eq!(
+        normalize_sentence("don't eat twenty one apples"),
+        "don't eat 21 apples"
+    );
+}