diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py
index cb8c9da..6c7a893 100644
--- a/machine/corpora/__init__.py
+++ b/machine/corpora/__init__.py
@@ -65,6 +65,7 @@
from .usx_file_alignment_corpus import UsxFileAlignmentCorpus
from .usx_file_text import UsxFileText
from .usx_file_text_corpus import UsxFileTextCorpus
+from .usx_memory_text import UsxMemoryText
from .usx_zip_text import UsxZipText
from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser
from .zip_paratext_project_settings_parser_base import ZipParatextProjectSettingsParserBase
@@ -150,6 +151,7 @@
"UsxFileAlignmentCorpus",
"UsxFileText",
"UsxFileTextCorpus",
+ "UsxMemoryText",
"UsxZipText",
"ZipParatextProjectSettingsParser",
"ZipParatextProjectSettingsParserBase",
diff --git a/machine/corpora/usx_file_alignment_collection.py b/machine/corpora/usx_file_alignment_collection.py
index ec59ab3..70d115f 100644
--- a/machine/corpora/usx_file_alignment_collection.py
+++ b/machine/corpora/usx_file_alignment_collection.py
@@ -131,14 +131,14 @@ def _get_links(word_tokenizer: RangeTokenizer[str, int, str], tokens: Sequence[U
text = ""
link_strs: List[Tuple[Range[int], str]] = []
for token in tokens:
- if token.para_element != prev_para_elem and len(text) > 0:
+ if token.parent_element != prev_para_elem and len(text) > 0:
text += " "
start = len(text)
text += str(token)
if token.element is not None and token.element.tag == "wg":
link_strs.append((Range.create(start, len(text)), token.element.get("target_links", "")))
- prev_para_elem = token.para_element
+ prev_para_elem = token.parent_element
text = text.strip()
i = 0
diff --git a/machine/corpora/usx_memory_text.py b/machine/corpora/usx_memory_text.py
new file mode 100644
index 0000000..0bfd242
--- /dev/null
+++ b/machine/corpora/usx_memory_text.py
@@ -0,0 +1,15 @@
+from typing import Optional
+
+from ..scripture.verse_ref import Versification
+from .memory_stream_container import MemoryStreamContainer
+from .stream_container import StreamContainer
+from .usx_text_base import UsxTextBase
+
+
+class UsxMemoryText(UsxTextBase):  # USX text whose document is a string held in memory
+ def __init__(self, id: str, usx: str, versification: Optional[Versification] = None) -> None:  # id and versification are forwarded to UsxTextBase unchanged
+ super().__init__(id, versification)
+ self._usx = usx  # raw USX document text; only wrapped in a container on demand
+
+ def _create_stream_container(self) -> StreamContainer:  # presumably the hook UsxTextBase uses to obtain its input - confirm in usx_text_base
+ return MemoryStreamContainer(self._usx)  # builds a new container from the stored string on every call
diff --git a/machine/corpora/usx_token.py b/machine/corpora/usx_token.py
index d7dfffd..61e0fd0 100644
--- a/machine/corpora/usx_token.py
+++ b/machine/corpora/usx_token.py
@@ -5,7 +5,7 @@
@dataclass(frozen=True)
class UsxToken:
- para_element: ElementTree.Element
+ parent_element: ElementTree.Element
text: str
element: Optional[ElementTree.Element]
diff --git a/machine/corpora/usx_verse.py b/machine/corpora/usx_verse.py
index 5f78e1e..6a17da8 100644
--- a/machine/corpora/usx_verse.py
+++ b/machine/corpora/usx_verse.py
@@ -25,7 +25,7 @@ def __init__(self, chapter: str, verse: str, is_sentence_start: bool, tokens: It
if (
prev_token is not None
- and token.para_element != prev_token.para_element
+ and token.parent_element != prev_token.parent_element
and len(text) > 0
and not ends_with_space
):
diff --git a/machine/corpora/usx_verse_parser.py b/machine/corpora/usx_verse_parser.py
index c857b20..e7f8e70 100644
--- a/machine/corpora/usx_verse_parser.py
+++ b/machine/corpora/usx_verse_parser.py
@@ -1,11 +1,12 @@
from __future__ import annotations
+import string
from dataclasses import dataclass, field
from typing import BinaryIO, Iterable, List, Optional
from xml.etree import ElementTree
from ..scripture.verse_ref import are_overlapping_verse_ranges
-from ..utils.string_utils import has_sentence_ending, is_integer
+from ..utils.string_utils import has_sentence_ending
from .corpora_utils import merge_verse_ranges
from .usx_token import UsxToken
from .usx_verse import UsxVerse
@@ -22,6 +23,7 @@ def parse(self, stream: BinaryIO) -> Iterable[UsxVerse]:
if root_elem is None:
raise RuntimeError("USX does not contain a book element.")
assert root_elem is not None
+ ctxt.parent_element = root_elem
for verse in self._parse_element(root_elem, ctxt):
yield verse
@@ -42,7 +44,7 @@ def _parse_element(self, elem: ElementTree.Element, ctxt: _ParseContext) -> Iter
if not _is_verse_para(e):
ctxt.is_sentence_start = True
continue
- ctxt.para_element = e
+ ctxt.parent_element = e
for evt in self._parse_element(e, ctxt):
yield evt
elif e.tag == "verse":
@@ -81,30 +83,61 @@ def _parse_element(self, elem: ElementTree.Element, ctxt: _ParseContext) -> Iter
elif e.tag == "figure":
if ctxt.chapter is not None and ctxt.verse is not None:
ctxt.add_token("", e)
+ elif e.tag == "table":
+ for evt in self._parse_element(e, ctxt):
+ yield evt
+ elif e.tag == "row":
+ for evt in self._parse_element(e, ctxt):
+ yield evt
+ elif e.tag == "cell":
+ ctxt.parent_element = e
+ for evt in self._parse_element(e, ctxt):
+ yield evt
if e.tail is not None and ctxt.chapter is not None and ctxt.verse is not None:
ctxt.add_token(e.tail)
-_NONVERSE_PARA_STYLES = {"ms", "mr", "s", "sr", "r", "d", "sp", "rem", "restore", "cl"}
-
-
-def _is_numbered_style(style_prefix: str, style: str) -> bool:
- return style.startswith(style_prefix) and is_integer(style[len(style_prefix) :])
+_VERSE_PARA_STYLES = {  # allow-list checked by _is_verse_para after trailing digits are stripped from the style
+ # Paragraphs
+ "p",
+ "m",
+ "po",
+ "pr",
+ "cls",
+ "pmo",
+ "pm",
+ "pmc",
+ "pmr",
+ "pi",
+ "pc",
+ "mi",
+ "nb",
+ # Poetry
+ "q",
+ "qc",
+ "qr",
+ "qm",
+ "qd",
+ "b",
+ "d",  # was listed in the removed _NONVERSE_PARA_STYLES; now counted as verse text (NOTE(review): confirm it belongs under this heading)
+ # Lists
+ "lh",
+ "li",
+ "lf",
+ "lim",
+ # Deprecated
+ "ph",
+ "phi",
+ "ps",
+ "psi",
+}
def _is_verse_para(para_elem: ElementTree.Element) -> bool:
style = para_elem.get("style", "")
- if style in _NONVERSE_PARA_STYLES:
- return False
-
- if _is_numbered_style("ms", style):
- return False
-
- if _is_numbered_style("s", style):
- return False
-
- return True
+ style = style.rstrip(string.digits)  # drop any trailing ASCII digits so a numbered style such as "q2" is looked up as "q"
+ return style in _VERSE_PARA_STYLES  # allow-list: a missing or unlisted style is now treated as non-verse (the removed code defaulted to True)
@dataclass
@@ -112,12 +145,12 @@ class _ParseContext:
chapter: Optional[str] = None
verse: Optional[str] = None
is_sentence_start: bool = True
- para_element: Optional[ElementTree.Element] = None
+ parent_element: Optional[ElementTree.Element] = None
_verse_tokens: List[UsxToken] = field(default_factory=list)
def add_token(self, text: str, elem: Optional[ElementTree.Element] = None) -> None:
- assert self.para_element is not None
- self._verse_tokens.append(UsxToken(self.para_element, text, elem))
+ assert self.parent_element is not None  # parse() assigns root_elem to parent_element before any element is walked
+ self._verse_tokens.append(UsxToken(self.parent_element, text, elem))  # token records the element it was found under (root, verse para, or table cell)
def create_verse(self) -> UsxVerse:
assert self.chapter is not None and self.verse is not None
diff --git a/tests/corpora/test_usx_memory_text.py b/tests/corpora/test_usx_memory_text.py
new file mode 100644
index 0000000..18d2532
--- /dev/null
+++ b/tests/corpora/test_usx_memory_text.py
@@ -0,0 +1,59 @@
+from typing import List
+
+from testutils.corpora_test_helpers import scripture_ref
+
+from machine.corpora import ScriptureRef, TextRow, UsxMemoryText
+
+
+def test_get_rows_descriptive_title() -> None:  # the descriptive-title text is expected to come back as the MAT 1:1 row
+ rows = get_rows(  # NOTE(review): the literal below shows no USX tags in this view - confirm the markup survived in the real patch
+ r"""
+ - Test
+
+
+ Descriptive title
+
+ The rest of verse one.
+ This is verse two.
+
+"""
+ )
+ assert len(rows) == 2  # NOTE(review): only rows[0] is inspected below; consider asserting rows[1] as well
+
+ assert scripture_ref(rows[0]) == ScriptureRef.parse("MAT 1:1"), str.join(",", [str(tr.ref) for tr in rows])  # failure message lists every parsed ref; ",".join(...) would be the usual spelling
+ assert rows[0].text == "Descriptive title", str.join(",", [tr.text for tr in rows])  # failure message lists every parsed row text
+
+
+def test_get_rows_table() -> None:  # text spread over table cells is expected to be joined into one row per verse
+ rows = get_rows(  # NOTE(review): the literal below shows only cell text and "|" separators in this view - confirm the USX table markup survived
+ r"""
+ - Test
+
+
+
+ Chapter |
+ 1 |
+ verse |
+ 1 |
+
+
+ |
+ Chapter 1 verse 2 |
+
+
+
+"""
+ )
+
+ assert len(rows) == 2  # one row for each of the two verses
+
+ assert scripture_ref(rows[0]) == ScriptureRef.parse("MAT 1:1")
+ assert rows[0].text == "Chapter 1 verse 1"  # the four separate cell texts come back space-separated
+
+ assert scripture_ref(rows[1]) == ScriptureRef.parse("MAT 1:2")
+ assert rows[1].text == "Chapter 1 verse 2"  # verse 2 text sits in a single cell
+
+
+def get_rows(usx: str) -> List[TextRow]:  # shared helper for the tests in this file
+ text = UsxMemoryText("MAT", usx)  # every test parses its USX string under the text id "MAT"
+ return list(text.get_rows())  # materialise the rows so tests can use len() and indexing