From 8c586b75a1228dfab65d5423d9a21256a2714045 Mon Sep 17 00:00:00 2001 From: Thilo von Neumann Date: Wed, 1 Oct 2025 08:29:22 +0200 Subject: [PATCH 1/4] Make the check in md_eval_22 more permissive --- meeteval/der/md_eval.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/meeteval/der/md_eval.py b/meeteval/der/md_eval.py index f80d521..07eb682 100644 --- a/meeteval/der/md_eval.py +++ b/meeteval/der/md_eval.py @@ -316,13 +316,16 @@ def convert(string): uem, ) summary = sum(per_reco.values()) - error_rate = summary.error_rate.quantize(md_eval.error_rate) - if error_rate != md_eval.error_rate: + + # Due to floating point precision, the output of md-eval-22.pl is not + # always reproduced exactly by average across the per-recording numbers. + # Therefore, the last digit may change. + if abs(summary.error_rate - md_eval.error_rate) > 0.00007: raise RuntimeError( f'The error rate of md-eval-22.pl on all recordings ' - f'({summary.error_rate})\n' - f'does not match the average error rate of md-eval-22.pl ' - f'applied to each recording ({md_eval.error_rate}).' + f'({md_eval.error_rate})\n' + f'does not match the averaged error rate across ' + f'all sessions ({summary.error_rate}).' 
) return per_reco From f0c89262d9e8014e6783900b03df9be96cfda561 Mon Sep 17 00:00:00 2001 From: Thilo von Neumann Date: Wed, 1 Oct 2025 08:35:09 +0200 Subject: [PATCH 2/4] Call md-eval with "-a f" option to get per-session results with a single call --- meeteval/der/md_eval.py | 99 +++++++++++++++++++++++------------------ 1 file changed, 55 insertions(+), 44 deletions(-) diff --git a/meeteval/der/md_eval.py b/meeteval/der/md_eval.py index 07eb682..dac9132 100644 --- a/meeteval/der/md_eval.py +++ b/meeteval/der/md_eval.py @@ -191,6 +191,49 @@ def restore(self, filename): raise ValueError(f'Cannot find {filename} as value in {self.cache}') +def _parse_md_eval_22_output(output: str) -> (DiaErrorRate, dict[str, DiaErrorRate]): + """ + Parses all output blocks from the md_eval_22 output. Each block has the format: + + *** Performance analysis for Speaker Diarization for f=utt_9 *** + + SCORED SPEAKER TIME =10.160000 secs + MISSED SPEAKER TIME =0.960000 secs + FALARM SPEAKER TIME =0.000000 secs + SPEAKER ERROR TIME =0.180000 secs + OVERALL SPEAKER DIARIZATION ERROR = 11.22 percent of scored speaker time `(f=utt_9) + + Each block is parsed into a `DiaErrorRate`. Returns a `DiaErrorRate` object + for the overall error rate (named "ALL" in md-eval-22) and a dict of + `DiaErrorRate` objects for the individual files (f=???). 
+ """ + # Pattern for each performance block + block_pattern = re.compile( + r"\*\*\* Performance analysis for Speaker Diarization for (?P<file>[^ ]+) \*\*\*" + r".*?SCORED SPEAKER TIME\s*=\s*(?P<scored>[\d.]+)" + r".*?MISSED SPEAKER TIME\s*=\s*(?P<missed>[\d.]+)" + r".*?FALARM SPEAKER TIME\s*=\s*(?P<falarm>[\d.]+)" + r".*?SPEAKER ERROR TIME\s*=\s*(?P<serror>[\d.]+)" + r".*?OVERALL SPEAKER DIARIZATION ERROR\s*=\s*(?P<error_rate>[\d.]+)", + re.DOTALL + ) + + results = {} + + for match in block_pattern.finditer(output): + file_name = match.group("file") + results[file_name] = DiaErrorRate( + error_rate=decimal.Decimal(match.group("error_rate")) / 100, + scored_speaker_time=decimal.Decimal(match.group("scored")), + missed_speaker_time=decimal.Decimal(match.group("missed")), + falarm_speaker_time=decimal.Decimal(match.group("falarm")), + speaker_error_time=decimal.Decimal(match.group("serror")), + ) + + summary = results.pop('ALL') + results = {k[2:]: v for k, v in results.items() if k.startswith('f=')} + + return summary, results def md_eval_22_multifile( reference, hypothesis, collar=0, regions='all', @@ -247,13 +290,13 @@ def md_eval_22_multifile( urllib.request.urlretrieve(url, md_eval_22) logging.info(f'Wrote {md_eval_22}') - warned = False - - def get_details(r, h, key, tmpdir, uem): - nonlocal warned + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir = Path(tmpdir) - r_file = tmpdir / f'{key}.ref.rttm' - h_file = tmpdir / f'{key}.hyp.rttm' + r = meeteval.io.RTTM([line for key in keys for line in r[key]]) + h = meeteval.io.RTTM([line for key in keys for line in h[key]]) + r_file = tmpdir / f'ref.rttm' + h_file = tmpdir / f'hyp.rttm' r.dump(r_file) h.dump(h_file) @@ -262,59 +305,25 @@ def get_details(r, h, key, tmpdir, uem): '-c', f'{collar}', '-r', f'{r_file}', '-s', f'{h_file}', + '-a', 'f', # Per-file details ] if regions == 'nooverlap': cmd.append('-1') if uem: - uem_file = tmpdir / f'{key}.uem' + uem_file = tmpdir / f'uem.uem' uem = escaper.escape_uem(uem) uem.dump(uem_file) cmd.extend(['-u', 
f'{uem_file}']) - elif not warned: + else: warned = True logging.warning(f'No UEM file provided. See https://github.com/fgnt/meeteval/issues/97#issuecomment-2508140402 for details.') cp = subprocess.run(cmd, stdout=subprocess.PIPE, check=True, universal_newlines=True) - # SCORED SPEAKER TIME =4309.340250 secs - # MISSED SPEAKER TIME =4309.340250 secs - # FALARM SPEAKER TIME =0.000000 secs - # SPEAKER ERROR TIME =0.000000 secs - # OVERALL SPEAKER DIARIZATION ERROR = 100.00 percent of scored speaker time `(ALL) - - error_rate, = re.findall(r'OVERALL SPEAKER DIARIZATION ERROR = ([\d.]+) percent of scored speaker time', - cp.stdout) - length, = re.findall(r'SCORED SPEAKER TIME =([\d.]+) secs', cp.stdout) - deletions, = re.findall(r'MISSED SPEAKER TIME =([\d.]+) secs', cp.stdout) - insertions, = re.findall(r'FALARM SPEAKER TIME =([\d.]+) secs', cp.stdout) - substitutions, = re.findall(r'SPEAKER ERROR TIME =([\d.]+) secs', cp.stdout) - - def convert(string): - return decimal.Decimal(string) - - return DiaErrorRate( - scored_speaker_time=convert(length), - missed_speaker_time=convert(deletions), - falarm_speaker_time=convert(insertions), - speaker_error_time=convert(substitutions), - error_rate=convert(error_rate) / 100, - ) + md_eval, per_reco = _parse_md_eval_22_output(cp.stdout) - per_reco = {} - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - for key in keys: - per_reco[escaper.restore(key)] = get_details(r[key], h[key], key, tmpdir, uem) - - md_eval = get_details( - meeteval.io.RTTM([line for key in keys for line in r[key]]), - meeteval.io.RTTM([line for key in keys for line in h[key]]), - '', - tmpdir, - uem, - ) summary = sum(per_reco.values()) # Due to floating point precision, the output of md-eval-22.pl is not @@ -328,6 +337,8 @@ def convert(string): f'all sessions ({summary.error_rate}).' 
) + per_reco = {escaper.restore(k): v for k, v in per_reco.items()} + return per_reco From e36c85ba7040dee4b786d857e45d5156a1864520 Mon Sep 17 00:00:00 2001 From: Thilo von Neumann Date: Wed, 1 Oct 2025 08:39:31 +0200 Subject: [PATCH 3/4] Check that all keys are present in the parsed md eval output --- meeteval/der/md_eval.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/meeteval/der/md_eval.py b/meeteval/der/md_eval.py index dac9132..77d7ed1 100644 --- a/meeteval/der/md_eval.py +++ b/meeteval/der/md_eval.py @@ -324,6 +324,8 @@ def md_eval_22_multifile( md_eval, per_reco = _parse_md_eval_22_output(cp.stdout) + assert per_reco.keys() == keys, (per_reco.keys(), keys) + summary = sum(per_reco.values()) # Due to floating point precision, the output of md-eval-22.pl is not From 58cc173dd5ae2b54104ab6f52a28d4971d1ff5ed Mon Sep 17 00:00:00 2001 From: Thilo von Neumann Date: Wed, 1 Oct 2025 14:18:34 +0200 Subject: [PATCH 4/4] Print a warning on small errors and raise an exception for larger errors --- meeteval/der/md_eval.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/meeteval/der/md_eval.py b/meeteval/der/md_eval.py index 77d7ed1..3aee15e 100644 --- a/meeteval/der/md_eval.py +++ b/meeteval/der/md_eval.py @@ -330,13 +330,26 @@ def md_eval_22_multifile( # Due to floating point precision, the output of md-eval-22.pl is not # always reproduced exactly by average across the per-recording numbers. - # Therefore, the last digit may change. + # We'll raise an error if the difference is large and print a warning + # when it only differs slightly. if abs(summary.error_rate - md_eval.error_rate) > 0.00007: raise RuntimeError( f'The error rate of md-eval-22.pl on all recordings ' f'({md_eval.error_rate})\n' - f'does not match the averaged error rate across ' - f'all sessions ({summary.error_rate}).' 
+ f'differs from the averaged error rate across ' + f'all sessions ({summary.error_rate}) by more than 0.00007 ' + f'({abs(summary.error_rate - md_eval.error_rate)}).' + ) + + quantized_error_rate = summary.error_rate.quantize( + md_eval.error_rate, rounding='ROUND_HALF_UP' + ) + if quantized_error_rate != md_eval.error_rate: + logging.warning( + f'The error rate of md-eval-22.pl on all recordings ' + f'({md_eval.error_rate}) does not match the averaged error ' + f'rate across all sessions ({quantized_error_rate}). This can ' + f'happen due to floating point inaccuracies.' ) per_reco = {escaper.restore(k): v for k, v in per_reco.items()}