diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py index cb8c9da..6c7a893 100644 --- a/machine/corpora/__init__.py +++ b/machine/corpora/__init__.py @@ -65,6 +65,7 @@ from .usx_file_alignment_corpus import UsxFileAlignmentCorpus from .usx_file_text import UsxFileText from .usx_file_text_corpus import UsxFileTextCorpus +from .usx_memory_text import UsxMemoryText from .usx_zip_text import UsxZipText from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser from .zip_paratext_project_settings_parser_base import ZipParatextProjectSettingsParserBase @@ -150,6 +151,7 @@ "UsxFileAlignmentCorpus", "UsxFileText", "UsxFileTextCorpus", + "UsxMemoryText", "UsxZipText", "ZipParatextProjectSettingsParser", "ZipParatextProjectSettingsParserBase", diff --git a/machine/corpora/usx_file_alignment_collection.py b/machine/corpora/usx_file_alignment_collection.py index ec59ab3..70d115f 100644 --- a/machine/corpora/usx_file_alignment_collection.py +++ b/machine/corpora/usx_file_alignment_collection.py @@ -131,14 +131,14 @@ def _get_links(word_tokenizer: RangeTokenizer[str, int, str], tokens: Sequence[U text = "" link_strs: List[Tuple[Range[int], str]] = [] for token in tokens: - if token.para_element != prev_para_elem and len(text) > 0: + if token.parent_element != prev_para_elem and len(text) > 0: text += " " start = len(text) text += str(token) if token.element is not None and token.element.tag == "wg": link_strs.append((Range.create(start, len(text)), token.element.get("target_links", ""))) - prev_para_elem = token.para_element + prev_para_elem = token.parent_element text = text.strip() i = 0 diff --git a/machine/corpora/usx_memory_text.py b/machine/corpora/usx_memory_text.py new file mode 100644 index 0000000..0bfd242 --- /dev/null +++ b/machine/corpora/usx_memory_text.py @@ -0,0 +1,15 @@ +from typing import Optional + +from ..scripture.verse_ref import Versification +from .memory_stream_container import MemoryStreamContainer +from .stream_container import StreamContainer +from .usx_text_base import UsxTextBase + + +class UsxMemoryText(UsxTextBase): + def __init__(self, id: str, usx: str, versification: Optional[Versification] = None) -> None: + super().__init__(id, versification) + self._usx = usx + + def _create_stream_container(self) -> StreamContainer: + return MemoryStreamContainer(self._usx) diff --git a/machine/corpora/usx_token.py b/machine/corpora/usx_token.py index d7dfffd..61e0fd0 100644 --- a/machine/corpora/usx_token.py +++ b/machine/corpora/usx_token.py @@ -5,7 +5,7 @@ @dataclass(frozen=True) class UsxToken: - para_element: ElementTree.Element + parent_element: ElementTree.Element text: str element: Optional[ElementTree.Element] diff --git a/machine/corpora/usx_verse.py b/machine/corpora/usx_verse.py index 5f78e1e..6a17da8 100644 --- a/machine/corpora/usx_verse.py +++ b/machine/corpora/usx_verse.py @@ -25,7 +25,7 @@ def __init__(self, chapter: str, verse: str, is_sentence_start: bool, tokens: It if ( prev_token is not None - and token.para_element != prev_token.para_element + and token.parent_element != prev_token.parent_element and len(text) > 0 and not ends_with_space ): diff --git a/machine/corpora/usx_verse_parser.py b/machine/corpora/usx_verse_parser.py index c857b20..e7f8e70 100644 --- a/machine/corpora/usx_verse_parser.py +++ b/machine/corpora/usx_verse_parser.py @@ -1,11 +1,12 @@ from __future__ import annotations +import string from dataclasses import dataclass, field from typing import BinaryIO, Iterable, List, Optional from xml.etree import ElementTree from ..scripture.verse_ref import are_overlapping_verse_ranges -from ..utils.string_utils import has_sentence_ending, is_integer +from ..utils.string_utils import has_sentence_ending from .corpora_utils import merge_verse_ranges from .usx_token import UsxToken from .usx_verse import UsxVerse @@ -22,6 +23,7 @@ def parse(self, stream: BinaryIO) -> Iterable[UsxVerse]: if root_elem is None: raise RuntimeError("USX does not contain a book element.") assert root_elem is not None + ctxt.parent_element = root_elem for verse in self._parse_element(root_elem, ctxt): yield verse @@ -42,7 +44,7 @@ def _parse_element(self, elem: ElementTree.Element, ctxt: _ParseContext) -> Iter if not _is_verse_para(e): ctxt.is_sentence_start = True continue - ctxt.para_element = e + ctxt.parent_element = e for evt in self._parse_element(e, ctxt): yield evt elif e.tag == "verse": @@ -81,30 +83,61 @@ def _parse_element(self, elem: ElementTree.Element, ctxt: _ParseContext) -> Iter elif e.tag == "figure": if ctxt.chapter is not None and ctxt.verse is not None: ctxt.add_token("", e) + elif e.tag == "table": + for evt in self._parse_element(e, ctxt): + yield evt + elif e.tag == "row": + for evt in self._parse_element(e, ctxt): + yield evt + elif e.tag == "cell": + ctxt.parent_element = e + for evt in self._parse_element(e, ctxt): + yield evt if e.tail is not None and ctxt.chapter is not None and ctxt.verse is not None: ctxt.add_token(e.tail) -_NONVERSE_PARA_STYLES = {"ms", "mr", "s", "sr", "r", "d", "sp", "rem", "restore", "cl"} - - -def _is_numbered_style(style_prefix: str, style: str) -> bool: - return style.startswith(style_prefix) and is_integer(style[len(style_prefix) :]) +_VERSE_PARA_STYLES = { + # Paragraphs + "p", + "m", + "po", + "pr", + "cls", + "pmo", + "pm", + "pmc", + "pmr", + "pi", + "pc", + "mi", + "nb", + # Poetry + "q", + "qc", + "qr", + "qm", + "qd", + "b", + "d", + # Lists + "lh", + "li", + "lf", + "lim", + # Deprecated + "ph", + "phi", + "ps", + "psi", +} def _is_verse_para(para_elem: ElementTree.Element) -> bool: style = para_elem.get("style", "") - if style in _NONVERSE_PARA_STYLES: - return False - - if _is_numbered_style("ms", style): - return False - - if _is_numbered_style("s", style): - return False - - return True + style = style.rstrip(string.digits) + return style in _VERSE_PARA_STYLES @dataclass @@ -112,12 +145,12 @@ class _ParseContext: chapter: Optional[str] = None verse: Optional[str] = None is_sentence_start: bool = True - para_element: Optional[ElementTree.Element] = None + parent_element: Optional[ElementTree.Element] = None _verse_tokens: List[UsxToken] = field(default_factory=list) def add_token(self, text: str, elem: Optional[ElementTree.Element] = None) -> None: - assert self.para_element is not None - self._verse_tokens.append(UsxToken(self.para_element, text, elem)) + assert self.parent_element is not None + self._verse_tokens.append(UsxToken(self.parent_element, text, elem)) def create_verse(self) -> UsxVerse: assert self.chapter is not None and self.verse is not None diff --git a/tests/corpora/test_usx_memory_text.py b/tests/corpora/test_usx_memory_text.py new file mode 100644 index 0000000..18d2532 --- /dev/null +++ b/tests/corpora/test_usx_memory_text.py @@ -0,0 +1,59 @@ +from typing import List + +from testutils.corpora_test_helpers import scripture_ref + +from machine.corpora import ScriptureRef, TextRow, UsxMemoryText + + +def test_get_rows_descriptive_title() -> None: + rows = get_rows( + r""" + - Test + + + Descriptive title + + The rest of verse one. + This is verse two. + +""" + ) + assert len(rows) == 2 + + assert scripture_ref(rows[0]) == ScriptureRef.parse("MAT 1:1"), str.join(",", [str(tr.ref) for tr in rows]) + assert rows[0].text == "Descriptive title", str.join(",", [tr.text for tr in rows]) + + +def test_get_rows_table() -> None: + rows = get_rows( + r""" + - Test + + + + Chapter + 1 + verse + 1 + + + + Chapter 1 verse 2 + +
+
+""" + ) + + assert len(rows) == 2 + + assert scripture_ref(rows[0]) == ScriptureRef.parse("MAT 1:1") + assert rows[0].text == "Chapter 1 verse 1" + + assert scripture_ref(rows[1]) == ScriptureRef.parse("MAT 1:2") + assert rows[1].text == "Chapter 1 verse 2" + + +def get_rows(usx: str) -> List[TextRow]: + text = UsxMemoryText("MAT", usx) + return list(text.get_rows())