Skip to content

Commit

Permalink
🐛 Skip paragraph recombination if the produced paragraph is longer than 30 s
Browse files Browse the repository at this point in the history
  • Loading branch information
pajowu committed Nov 28, 2023
1 parent 5270089 commit e061709
Show file tree
Hide file tree
Showing 2 changed files with 123 additions and 4 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
{
"input": [
{
"type": "paragraph",
"speaker": null,
"lang": "de",
"children": [
{
"text": "Willkommen ",
"start": 0.0,
"end": 0.82,
"conf": 0.4493102431297302,
"conf_ts": 0.0
},
{
"text": "zum ",
"start": 0.82,
"end": 30.07,
"conf": 0.9744400978088379,
"conf_ts": 0.005903157405555248
}
]
},
{
"type": "paragraph",
"speaker": null,
"lang": "de",
"children": [
{
"text": "letzten ",
"start": 30.07,
"end": 31.65,
"conf": 0.9838394522666931,
"conf_ts": 0.01149927917867899
},
{
"text": "Token.",
"start": 31.65,
"end": 32.06,
"conf": 0.9566531777381897,
"conf_ts": 0.0096774036064744
}
]
}
],
"expected": [
{
"type": "paragraph",
"speaker": null,
"lang": "de",
"children": [
{
"text": "Willkommen ",
"start": 0.0,
"end": 0.82,
"conf": 0.4493102431297302,
"conf_ts": 0.0
},
{
"text": "zum ",
"start": 0.82,
"end": 30.07,
"conf": 0.9744400978088379,
"conf_ts": 0.005903157405555248
}
]
},
{
"type": "paragraph",
"speaker": null,
"lang": "de",
"children": [
{
"text": "letzten ",
"start": 30.07,
"end": 31.65,
"conf": 0.9838394522666931,
"conf_ts": 0.01149927917867899
},
{
"text": "Token.",
"start": 31.65,
"end": 32.06,
"conf": 0.9566531777381897,
"conf_ts": 0.0096774036064744
}
]
}
]
}
37 changes: 33 additions & 4 deletions worker/transcribee_worker/whisper_transcribe.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,12 +251,30 @@ async def strict_sentence_paragraphs(
iter: AsyncIterator[Paragraph],
) -> AsyncIterator[Paragraph]:
acc_paragraph = None
acc_used_paras = []
combination_active = True
async for paragraph in iter:
if acc_paragraph is None:
if not combination_active:
yield paragraph
continue
elif acc_paragraph is None:
acc_paragraph = Paragraph(
lang=paragraph.lang, speaker=paragraph.speaker, children=[]
)

acc_used_paras = []
elif (
(start := acc_paragraph.start()) is not None
and (end := paragraph.end()) is not None
and end - start > 30
):
# It seems like whisper doesn't produce sentence breaks. Discard the
# current `acc_paragraph`, yield the original paragraphs instead, and
# disable this recombination step until the end of the document.
combination_active = False
for para in acc_used_paras:
yield para
yield paragraph
continue
elif (
acc_paragraph.lang != paragraph.lang
or acc_paragraph.speaker != paragraph.speaker
Expand All @@ -266,6 +284,7 @@ async def strict_sentence_paragraphs(
acc_paragraph = Paragraph(
lang=paragraph.lang, speaker=paragraph.speaker, children=[]
)
acc_used_paras = []

elif any(regex.search(paragraph.text()) for regex in DONT_COMBINE_RES):
if acc_paragraph.children:
Expand All @@ -285,7 +304,9 @@ async def strict_sentence_paragraphs(
acc_paragraph = Paragraph(
lang=paragraph.lang, speaker=paragraph.speaker, children=[]
)
for atom in paragraph.children:
acc_used_paras = []
acc_yield_offset = 0
for i, atom in enumerate(paragraph.children):
acc_paragraph.children.append(atom)
text = acc_paragraph.text()
if offset + len(text) in breaks and not any(
Expand All @@ -296,7 +317,15 @@ async def strict_sentence_paragraphs(
acc_paragraph = Paragraph(
lang=paragraph.lang, speaker=paragraph.speaker, children=[]
)
if acc_paragraph is not None and acc_paragraph.children:
acc_yield_offset = i
acc_used_paras.append(
Paragraph(
lang=paragraph.lang,
speaker=paragraph.speaker,
children=paragraph.children[acc_yield_offset:],
)
)
if acc_paragraph is not None and acc_paragraph.children and combination_active:
yield acc_paragraph


Expand Down

0 comments on commit e061709

Please sign in to comment.