Skip to content

Commit

Permalink
Fix test; add corpora extensions test
Browse files Browse the repository at this point in the history
  • Loading branch information
Enkidu93 committed Nov 5, 2024
1 parent c3ef946 commit 282c473
Show file tree
Hide file tree
Showing 4 changed files with 204 additions and 7 deletions.
2 changes: 1 addition & 1 deletion src/SIL.Machine/Corpora/CorporaExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -564,7 +564,7 @@ public MergedCorpus(NParallelTextCorpus nParallelTextCorpus, MergeRule mergeRule

public override IEnumerable<TextRow> GetRows(IEnumerable<string> textIds)
{
foreach (NParallelTextRow nRow in _corpus.GetRows())
foreach (NParallelTextRow nRow in _corpus.GetRows(textIds))
{
if (nRow.N == 0 || nRow.IsEmpty)
continue;
Expand Down
16 changes: 11 additions & 5 deletions src/SIL.Machine/Corpora/NParallelTextCorpus.cs
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,10 @@ private IEnumerable<NParallelTextRow> CreateRows(
}
else
{
refs.Add(CorrectVersification(refRefs, i));
if (Corpora[i].IsScripture())
refs.Add(CorrectVersification(refRefs, i));
else
refs.Add(new object[] { });
flags.Add(forceInRange != null && forceInRange[i] ? TextRowFlags.InRange : TextRowFlags.None);
}
}
Expand All @@ -381,7 +384,7 @@ private IEnumerable<NParallelTextRow> CreateMinRefRows(
.ToList();

List<int> alreadyYielded = new List<int>();

TextRow[] textRows;
foreach (int i in minRefIndexes)
{
TextRow textRow = currentRows[i];
Expand All @@ -394,7 +397,7 @@ private IEnumerable<NParallelTextRow> CreateMinRefRows(
alreadyYielded.Add(i);
foreach (TextRow sameRefRow in sameRefRows)
{
var textRows = new TextRow[N];
textRows = new TextRow[N];
textRows[i] = textRow;
textRows[j] = sameRefRow;
foreach (
Expand All @@ -407,13 +410,16 @@ NParallelTextRow row in CreateRows(rangeInfo, textRows, forceInRange: forceInRan
}
}
}
textRows = new TextRow[N];
var forceCurrentInRange = new bool[N];
foreach (int i in minRefIndexes.Where(i => AllRowsList[i]).Except(alreadyYielded))
{
TextRow textRow = currentRows[i];
var textRows = new TextRow[N];
textRows[i] = textRow;
var forceCurrentInRange = new bool[N];
forceCurrentInRange[i] = forceCurrentInRange[i];
}
if (textRows.Any(tr => tr != null))
{
foreach (NParallelTextRow row in CreateRows(rangeInfo, textRows, forceCurrentInRange))
{
yield return row;
Expand Down
150 changes: 149 additions & 1 deletion tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using NUnit.Framework;
using System.Text.Json;
using NUnit.Framework;
using SIL.Scripture;

namespace SIL.Machine.Corpora;
Expand Down Expand Up @@ -64,4 +65,151 @@ public void ExtractScripture()
Assert.That(origRef, Is.EqualTo(new VerseRef("MAT 2:12", ScrVers.Original)));
Assert.That(corpusRef, Is.EqualTo(new VerseRef("MAT 2:12", corpus.Versification)));
}

// Verifies SelectFirst() over three corpora that all contain "text1".
// Fixture: corpus1 has text for ref 1, no row for ref 2 and an empty row for ref 3;
// corpus2 has text for refs 1-2 and an empty row for ref 3; corpus3 has text for refs 1-3.
// The test asserts that each merged row carries the text of the lowest-numbered corpus
// that supplies a non-empty segment for that ref.
[Test]
public void MergedCorpus_SelectFirst()
{
    var corpus1 = new DictionaryTextCorpus(
        new MemoryText("text1", new[] { TextRow("text1", 1, "source 1 segment 1 ."), TextRow("text1", 3) })
    );
    var corpus2 = new DictionaryTextCorpus(
        new MemoryText(
            "text1",
            new[]
            {
                TextRow("text1", 1, "source 2 segment 1 ."),
                TextRow("text1", 2, "source 2 segment 2 ."),
                TextRow("text1", 3)
            }
        )
    );
    var corpus3 = new DictionaryTextCorpus(
        new MemoryText(
            "text1",
            new[]
            {
                TextRow("text1", 1, "source 3 segment 1 ."),
                TextRow("text1", 2, "source 3 segment 2 ."),
                TextRow("text1", 3, "source 3 segment 3 .")
            }
        )
    );
    var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRowsList = [true, true, true] };
    var mergedCorpus = nParallelCorpus.SelectFirst();
    var rows = mergedCorpus.ToArray();

    // The length check stays outside Assert.Multiple so the test stops here on a
    // wrong row count instead of indexing past the end of the array below.
    Assert.That(rows, Has.Length.EqualTo(3), JsonSerializer.Serialize(rows));

    // Grouped like the sibling SelectRandom tests so that every mismatching row is
    // reported in a single run rather than only the first one.
    Assert.Multiple(() =>
    {
        Assert.That(rows[0].Text, Is.EqualTo("source 1 segment 1 ."));
        Assert.That(rows[1].Text, Is.EqualTo("source 2 segment 2 ."));
        Assert.That(rows[2].Text, Is.EqualTo("source 3 segment 3 ."));
    });
}

// Verifies SelectRandom(123456) over three corpora that each supply text for refs 1-3
// of "text1". The expected picks below are pinned to this seed.
// NOTE(review): the expectations presumably depend on the seeded random sequence staying
// stable across runtimes — confirm if this test starts failing after a platform upgrade.
[Test]
public void MergedCorpus_SelectRandom_Seed123456()
{
    // Wraps the given rows in a single-text corpus whose only text is "text1".
    static DictionaryTextCorpus SingleTextCorpus(params TextRow[] textRows) =>
        new DictionaryTextCorpus(new MemoryText("text1", textRows));

    DictionaryTextCorpus first = SingleTextCorpus(
        TextRow("text1", 1, "source 1 segment 1 ."),
        TextRow("text1", 2, "source 1 segment 2 ."),
        TextRow("text1", 3, "source 1 segment 3 .")
    );
    DictionaryTextCorpus second = SingleTextCorpus(
        TextRow("text1", 1, "source 2 segment 1 ."),
        TextRow("text1", 2, "source 2 segment 2 ."),
        TextRow("text1", 3, "source 2 segment 3 .")
    );
    DictionaryTextCorpus third = SingleTextCorpus(
        TextRow("text1", 1, "source 3 segment 1 ."),
        TextRow("text1", 2, "source 3 segment 2 ."),
        TextRow("text1", 3, "source 3 segment 3 .")
    );

    var nParallel = new NParallelTextCorpus([first, second, third]) { AllRowsList = [true, true, true] };
    var merged = nParallel.SelectRandom(123456);
    var mergedRows = merged.ToArray();

    // Serialized rows are attached as the failure message to ease diagnosis.
    Assert.That(mergedRows, Has.Length.EqualTo(3), JsonSerializer.Serialize(mergedRows));
    Assert.Multiple(() =>
    {
        Assert.That(mergedRows[0].Text, Is.EqualTo("source 1 segment 1 ."));
        Assert.That(mergedRows[1].Text, Is.EqualTo("source 1 segment 2 ."));
        Assert.That(mergedRows[2].Text, Is.EqualTo("source 1 segment 3 ."));
    });
}

// Verifies SelectRandom(4501) over three corpora that each supply text for refs 1-3
// of "text1". With this seed the test expects row 1 from source 1, row 2 from source 2
// and row 3 from source 3.
// NOTE(review): the expectations presumably depend on the seeded random sequence staying
// stable across runtimes — confirm if this test starts failing after a platform upgrade.
[Test]
public void MergedCorpus_SelectRandom_Seed4501()
{
    // Wraps the given rows in a single-text corpus whose only text is "text1".
    static DictionaryTextCorpus SingleTextCorpus(params TextRow[] textRows) =>
        new DictionaryTextCorpus(new MemoryText("text1", textRows));

    DictionaryTextCorpus first = SingleTextCorpus(
        TextRow("text1", 1, "source 1 segment 1 ."),
        TextRow("text1", 2, "source 1 segment 2 ."),
        TextRow("text1", 3, "source 1 segment 3 .")
    );
    DictionaryTextCorpus second = SingleTextCorpus(
        TextRow("text1", 1, "source 2 segment 1 ."),
        TextRow("text1", 2, "source 2 segment 2 ."),
        TextRow("text1", 3, "source 2 segment 3 .")
    );
    DictionaryTextCorpus third = SingleTextCorpus(
        TextRow("text1", 1, "source 3 segment 1 ."),
        TextRow("text1", 2, "source 3 segment 2 ."),
        TextRow("text1", 3, "source 3 segment 3 .")
    );

    var nParallel = new NParallelTextCorpus([first, second, third]) { AllRowsList = [true, true, true] };
    var merged = nParallel.SelectRandom(4501);
    var mergedRows = merged.ToArray();

    // Serialized rows are attached as the failure message to ease diagnosis.
    Assert.That(mergedRows, Has.Length.EqualTo(3), JsonSerializer.Serialize(mergedRows));
    Assert.Multiple(() =>
    {
        Assert.That(mergedRows[0].Text, Is.EqualTo("source 1 segment 1 ."));
        Assert.That(mergedRows[1].Text, Is.EqualTo("source 2 segment 2 ."));
        Assert.That(mergedRows[2].Text, Is.EqualTo("source 3 segment 3 ."));
    });
}

// Test helper that builds a TextRow for the given text id and row reference.
// An empty `text` yields an empty segment; otherwise the segment is the result of
// text.Split() (no explicit separators). Flags default to SentenceStart.
private static TextRow TextRow(
    string textId,
    object rowRef,
    string text = "",
    TextRowFlags flags = TextRowFlags.SentenceStart
)
{
    string[] tokens = text.Length > 0 ? text.Split() : Array.Empty<string>();
    var row = new TextRow(textId, rowRef) { Segment = tokens, Flags = flags };
    return row;
}
}
43 changes: 43 additions & 0 deletions tests/SIL.Machine.Tests/Corpora/NParallelTextCorpusTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,49 @@ public void GetRows_ThreeCorpora_MissingRows_SomeAllRows()
Assert.That(rows[1].GetIsSentenceStart(1), Is.False);
}

// Three corpora share "text1"; the first one has no row for ref 2 while the other two
// have rows for refs 1-3, and AllRowsList is true for every corpus. The test expects
// three parallel rows and checks the middle one: every per-corpus ref list is either
// empty or starts with 2, every per-corpus segment is either empty or the tokens of
// "source segment 2 .", and corpus index 1 is reported as a sentence start.
[Test]
public void GetRows_ThreeCorpora_MissingRows_AllAllRows_MissingMiddle()
{
    // Wraps the given rows in a single-text corpus whose only text is "text1".
    static DictionaryTextCorpus SingleTextCorpus(params TextRow[] textRows) =>
        new DictionaryTextCorpus(new MemoryText("text1", textRows));

    DictionaryTextCorpus first = SingleTextCorpus(
        TextRow("text1", 1, "source segment 1 .", TextRowFlags.None),
        TextRow("text1", 3, "source segment 3 .")
    );
    DictionaryTextCorpus second = SingleTextCorpus(
        TextRow("text1", 1, "source segment 1 .", TextRowFlags.None),
        TextRow("text1", 2, "source segment 2 ."),
        TextRow("text1", 3, "source segment 3 .", TextRowFlags.None)
    );
    DictionaryTextCorpus third = SingleTextCorpus(
        TextRow("text1", 1, "source segment 1 ."),
        TextRow("text1", 2, "source segment 2 ."),
        TextRow("text1", 3, "source segment 3 .", TextRowFlags.None)
    );

    var nParallel = new NParallelTextCorpus([first, second, third]) { AllRowsList = [true, true, true] };
    NParallelTextRow[] nRows = nParallel.ToArray();
    Assert.That(nRows.Length, Is.EqualTo(3));

    // Only index into the array once the row count has been confirmed.
    NParallelTextRow middle = nRows[1];
    string[] expectedTokens = "source segment 2 .".Split();
    Assert.That(middle.NRefs.All(r => r.Count == 0 || (int)r[0] == 2));
    Assert.That(middle.NSegments.All(r => r.Count == 0 || r.SequenceEqual(expectedTokens)));
    Assert.That(middle.GetIsSentenceStart(1), Is.True);
}

[Test]
public void GetRows_ThreeCorpora_MissingRows_MissingLastRows()
{
Expand Down

0 comments on commit 282c473

Please sign in to comment.