From e406b9f81c349b458eb66bf092ef524a5d7f9729 Mon Sep 17 00:00:00 2001 From: pajowu Date: Tue, 28 Nov 2023 00:13:30 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20Fix=20DONT=5FCOMBINE=5FRES=20not?= =?UTF-8?q?=20doing=20anything?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...tence_paragraphs-leaves_special_paras.json | 167 ++++++++++++++++++ worker/tests/test_transcribe.py | 1 + .../transcribee_worker/whisper_transcribe.py | 2 +- 3 files changed, 169 insertions(+), 1 deletion(-) create mode 100644 worker/tests/data/test_strict_sentence_paragraphs-leaves_special_paras.json diff --git a/worker/tests/data/test_strict_sentence_paragraphs-leaves_special_paras.json b/worker/tests/data/test_strict_sentence_paragraphs-leaves_special_paras.json new file mode 100644 index 00000000..169b2b98 --- /dev/null +++ b/worker/tests/data/test_strict_sentence_paragraphs-leaves_special_paras.json @@ -0,0 +1,167 @@ +{ + "input": [ + { + "type": "paragraph", + "speaker": null, + "lang": "de", + "children": [ + { + "text": "*", + "start": 0.0, + "end": 0.82, + "conf": 0.4493102431297302, + "conf_ts": 0.0 + }, + { + "text": "Klirren", + "start": 0.82, + "end": 1.0, + "conf": 0.9744400978088379, + "conf_ts": 0.005903157405555248 + }, + { + "text": "*", + "start": 1.0, + "end": 1.07, + "conf": 0.9744400978088379, + "conf_ts": 0.005903157405555248 + } + ] + }, + { + "type": "paragraph", + "speaker": null, + "lang": "de", + "children": [ + { + "text": "Ich ", + "start": 1.07, + "end": 1.65, + "conf": 0.9838394522666931, + "conf_ts": 0.01149927917867899 + }, + { + "text": "will ", + "start": 1.65, + "end": 2.0, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + }, + { + "text": "Infos, ", + "start": 2.0, + "end": 2.07, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + } + ] + }, + { + "type": "paragraph", + "speaker": null, + "lang": "de", + "children": [ + { + "text": "Ich ", + "start": 2.07, + "end": 2.65, + "conf": 0.9838394522666931, + "conf_ts": 0.01149927917867899 + }, + { + "text": "will ", + "start": 2.65, + "end": 3.06, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + }, + { + "text": "Fakten, ", + "start": 3.06, + "end": 3.09, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + } + ] + } + ], + "expected": [ + { + "type": "paragraph", + "speaker": null, + "lang": "de", + "children": [ + { + "text": "*", + "start": 0.0, + "end": 0.82, + "conf": 0.4493102431297302, + "conf_ts": 0.0 + }, + { + "text": "Klirren", + "start": 0.82, + "end": 1.0, + "conf": 0.9744400978088379, + "conf_ts": 0.005903157405555248 + }, + { + "text": "*", + "start": 1.0, + "end": 1.07, + "conf": 0.9744400978088379, + "conf_ts": 0.005903157405555248 + } + ] + }, + { + "type": "paragraph", + "speaker": null, + "lang": "de", + "children": [ + { + "text": "Ich ", + "start": 1.07, + "end": 1.65, + "conf": 0.9838394522666931, + "conf_ts": 0.01149927917867899 + }, + { + "text": "will ", + "start": 1.65, + "end": 2.0, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + }, + { + "text": "Infos, ", + "start": 2.0, + "end": 2.07, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + }, + { + "text": "Ich ", + "start": 2.07, + "end": 2.65, + "conf": 0.9838394522666931, + "conf_ts": 0.01149927917867899 + }, + { + "text": "will ", + "start": 2.65, + "end": 3.06, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + }, + { + "text": "Fakten, ", + "start": 3.06, + "end": 3.09, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + } + ] + } + ] +} diff --git a/worker/tests/test_transcribe.py b/worker/tests/test_transcribe.py index 3489fb25..fa6faab0 100644 --- a/worker/tests/test_transcribe.py +++ b/worker/tests/test_transcribe.py @@ -67,6 +67,7 @@ def test_strict_sentence_paragraphs(data_file): async_doc_chain_func_to_list(strict_sentence_paragraphs)(test_data.input) ) ) + assert [x.text() for x in output] == [x.text() for x in test_data.expected] assert output == test_data.expected diff --git a/worker/transcribee_worker/whisper_transcribe.py b/worker/transcribee_worker/whisper_transcribe.py index 1e7785a9..caec0dcf 100644 --- a/worker/transcribee_worker/whisper_transcribe.py +++ b/worker/transcribee_worker/whisper_transcribe.py @@ -286,7 +286,7 @@ async def strict_sentence_paragraphs( ) acc_used_paras = [] - elif any(regex.search(paragraph.text()) for regex in DONT_COMBINE_RES): + if any(regex.search(paragraph.text()) for regex in DONT_COMBINE_RES): if acc_paragraph.children: yield acc_paragraph acc_paragraph = None