diff --git a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs index 246b2d48..22a2e91a 100644 --- a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs @@ -28,6 +28,7 @@ public bool GetIsTokenized(int i) => public IReadOnlyList AllRowsList { get; set; } public IReadOnlyList Corpora { get; } + public IAlignmentCorpus AlignmentCorpus { get; set; } public IComparer RowRefComparer { get; } private static HashSet GetTextIdsFromCorpora( @@ -63,8 +64,9 @@ public override IEnumerable GetRows(IEnumerable textId if (textIds != null) filterTextIds.IntersectWith(textIds); + IEnumerator alignmentEnumerator = null; IList> enumeratedCorpora = new List>(); - IEnumerable ret = new List() { }; + IEnumerable rows = new List() { }; try { for (int i = 0; i < Corpora.Count; i++) @@ -74,7 +76,10 @@ public override IEnumerable GetRows(IEnumerable textId new TextCorpusEnumerator(enumerator, Corpora[0].Versification, Corpora[i].Versification) ); } - ret = GetRows(enumeratedCorpora).ToList(); //TODO cleanup + + if (AlignmentCorpus != null) + alignmentEnumerator = AlignmentCorpus.GetRows(filterTextIds).GetEnumerator(); + rows = GetRows(enumeratedCorpora, alignmentEnumerator).ToList(); } finally { @@ -82,14 +87,14 @@ public override IEnumerable GetRows(IEnumerable textId { enumerator.Dispose(); } + alignmentEnumerator?.Dispose(); } - return ret; + return rows; } - private bool AnyInRangeWithSegments(IList rows) + private bool AllInRangeHaveSegments(IList rows) { - return (rows.Any(r => r.IsInRange && r.Segment.Count > 0) && rows.Any(r => !r.IsInRange)) - || rows.All(r => r.IsInRange && r.Segment.Count > 0); + return rows.All(r => (r.IsInRange && r.Segment.Count > 0) || (!r.IsInRange)); } private IList MinRefIndexes(IList refs) @@ -112,7 +117,10 @@ private IList MinRefIndexes(IList refs) return minRefIndexes; } - private IEnumerable GetRows(IList> listOfEnumerators) + private IEnumerable GetRows( + IList> listOfEnumerators, + IEnumerator alignmentEnumerator + ) { { var rangeInfo = new NRangeInfo(N) @@ -178,17 +186,14 @@ private IEnumerable GetRows(IList> listOf foreach ( NParallelTextRow row in CreateMinRefRows( rangeInfo, - currentRows, - minRefIndexes, - nonMinRefIndexes, + currentRows.ToArray(), + minRefIndexes.ToArray(), + nonMinRefIndexes.ToArray(), forceInRange: minRefIndexes .Select(i => - nonMinEnumerators - .Where(e => e.Current != null) - .Select(e => e.Current.TextId) - .Union(new List { currentRows[i].TextId }) - .Distinct() - .Count() == 1 //TODO clean up + nonMinEnumerators.All(e => + e.Current != null && e.Current.TextId == currentRows[i].TextId + ) && nonMinEnumerators .Where(e => e.Current != null) .Select(e => !e.Current.IsRangeStart && e.Current.IsInRange) @@ -210,18 +215,39 @@ NParallelTextRow row in CreateMinRefRows( else if (minRefIndexes.Count == (N - completed.Count(c => c))) // the refs are all the same { + int compareAlignmentCorpus = -1; + if (AlignmentCorpus != null) + { + do + { + try + { + compareAlignmentCorpus = alignmentEnumerator.MoveNext() + ? RowRefComparer.Compare( + currentIncompleteRows[0].Ref, + alignmentEnumerator.Current.Ref + ) + : 1; + } + catch (ArgumentException) + { + throw new CorpusAlignmentException( + currentRows.Select(e => e.Ref.ToString()).ToArray() + ); + } + } while (compareAlignmentCorpus < 0); + } + if ( minRefIndexes .Select(i => - !AllRowsList[i] - && minRefIndexes - .Where(j => j != i && !completed[j] && listOfEnumerators[j].Current.IsInRange) - .Any() + listOfEnumerators[i].Current.IsInRange + && minRefIndexes.All(j => j == i || !AllRowsList[j]) ) .Any(b => b) ) { - if (rangeInfo.IsInRange && AnyInRangeWithSegments(currentIncompleteRows)) + if (rangeInfo.IsInRange && AllInRangeHaveSegments(currentIncompleteRows)) { yield return rangeInfo.CreateRow(); } @@ -256,7 +282,15 @@ NParallelTextRow row in CreateMinRefRows( } } } - foreach (NParallelTextRow row in CreateRows(rangeInfo, currentIncompleteRows)) + foreach ( + NParallelTextRow row in CreateRows( + rangeInfo, + currentIncompleteRows, + alignedWordPairs: AlignmentCorpus != null && compareAlignmentCorpus == 0 + ? alignmentEnumerator.Current.AlignedWordPairs.ToArray() + : null + ) + ) { yield return row; } @@ -281,7 +315,7 @@ NParallelTextRow row in CreateMinRefRows( } } - private object[] UnifyVersification(object[] refs, int i) + private object[] CorrectVersification(object[] refs, int i) { if (Corpora.Any(c => c.Versification == null) || refs.Length == 0) return refs; @@ -293,8 +327,9 @@ private object[] UnifyVersification(object[] refs, int i) private IEnumerable CreateRows( NRangeInfo rangeInfo, - IList rows, - IList forceInRange = null + IReadOnlyList rows, + IReadOnlyList forceInRange = null, + IReadOnlyList alignedWordPairs = null ) { if (rangeInfo.IsInRange) @@ -313,13 +348,13 @@ private IEnumerable CreateRows( { textId = textId ?? rows[i]?.TextId; refs.Add( - UnifyVersification(rows[i].Ref == null ? new object[] { } : new object[] { rows[i].Ref }, i) + CorrectVersification(rows[i].Ref == null ? new object[] { } : new object[] { rows[i].Ref }, i) ); flags.Add(rows[i].Flags); } else { - refs.Add(new object[] { }); + refs.Add(CorrectVersification(refRefs, i)); flags.Add(forceInRange != null && forceInRange[i] ? TextRowFlags.InRange : TextRowFlags.None); } } @@ -327,16 +362,17 @@ private IEnumerable CreateRows( yield return new NParallelTextRow(textId, refs) { NSegments = rows.Select(r => r?.Segment ?? Array.Empty()).ToArray(), - NFlags = flags.ToReadOnlyList() + NFlags = flags.ToReadOnlyList(), + AlignedWordPairs = alignedWordPairs }; } private IEnumerable CreateMinRefRows( NRangeInfo rangeInfo, - IList currentRows, - IList minRefIndexes, - IList nonMinRefIndexes, - IList forceInRange = null + IReadOnlyList currentRows, + IReadOnlyList minRefIndexes, + IReadOnlyList nonMinRefIndexes, + IReadOnlyList forceInRange = null ) { List<(IList Rows, int Index)> sameRefRowsPerIndex = nonMinRefIndexes diff --git a/src/SIL.Machine/Corpora/NParallelTextRow.cs b/src/SIL.Machine/Corpora/NParallelTextRow.cs index 146ba600..cc04b52e 100644 --- a/src/SIL.Machine/Corpora/NParallelTextRow.cs +++ b/src/SIL.Machine/Corpora/NParallelTextRow.cs @@ -46,6 +46,8 @@ public bool GetIsRangeStart(int i) => public string GetText(int i) => string.Join(" ", NSegments[i]); + public IReadOnlyCollection AlignedWordPairs { get; set; } + public NParallelTextRow Invert() { return new NParallelTextRow(TextId, NRefs.Reverse()) { NFlags = NFlags.Reverse().ToImmutableArray(), }; diff --git a/src/SIL.Machine/Corpora/ParallelTextCorpus.cs b/src/SIL.Machine/Corpora/ParallelTextCorpus.cs index 9b9f668e..e015dc47 100644 --- a/src/SIL.Machine/Corpora/ParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/ParallelTextCorpus.cs @@ -20,7 +20,10 @@ public ParallelTextCorpus( TargetCorpus = targetCorpus; AlignmentCorpus = alignmentCorpus ?? new DictionaryAlignmentCorpus(); RowRefComparer = rowRefComparer ?? new DefaultRowRefComparer(); - NParallelTextCorpus = new NParallelTextCorpus(new List { SourceCorpus, TargetCorpus }); + NParallelTextCorpus = new NParallelTextCorpus(new List { SourceCorpus, TargetCorpus }) + { + AlignmentCorpus = AlignmentCorpus + }; } public override bool IsSourceTokenized => SourceCorpus.IsTokenized; @@ -45,19 +48,13 @@ public override IEnumerable GetRows(IEnumerable textIds foreach (var nRow in NParallelTextCorpus.GetRows(textIds)) { - bool hasTarget = nRow.N > 1; - if (!hasTarget && !AllTargetRows) - continue; - yield return new ParallelTextRow( - nRow.TextId, - nRow.NRefs[0], - hasTarget ? nRow.NRefs[1] : new object[] { } - ) + yield return new ParallelTextRow(nRow.TextId, nRow.NRefs[0], nRow.NRefs[1]) { SourceFlags = nRow.NFlags[0], - TargetFlags = hasTarget ? nRow.NFlags[1] : new TextRowFlags(), + TargetFlags = nRow.NFlags[1], SourceSegment = nRow.NSegments[0], - TargetSegment = hasTarget ? nRow.NSegments[1] : new string[] { } + TargetSegment = nRow.NSegments[1], + AlignedWordPairs = nRow.AlignedWordPairs }; } } diff --git a/tests/SIL.Machine.Tests/Corpora/NParallelTextCorpusTests.cs b/tests/SIL.Machine.Tests/Corpora/NParallelTextCorpusTests.cs new file mode 100644 index 00000000..cb1b4f4b --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/NParallelTextCorpusTests.cs @@ -0,0 +1,542 @@ +using System.Text.Json; +using NUnit.Framework; + +namespace SIL.Machine.Corpora; + +[TestFixture] +public class NParallelTextCorpusTests +{ + [Test] + public void GetRows_ThreeCorpora() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 .", TextRowFlags.None), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .", TextRowFlags.None) + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 .", TextRowFlags.None), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]); + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(3)); + Assert.That(rows[0].NRefs.All(r => (int)r[0] == 1)); + Assert.That(rows[0].NSegments.All(r => r.SequenceEqual("source segment 1 .".Split()))); + Assert.That(rows[0].GetIsSentenceStart(0), Is.False); + Assert.That(rows[0].GetIsSentenceStart(1), Is.True); + Assert.That(rows[2].NRefs.All(r => (int)r[0] == 3)); + Assert.That(rows[2].NSegments.All(r => r.SequenceEqual("source segment 3 .".Split()))); + Assert.That(rows[2].GetIsSentenceStart(1), Is.False); + Assert.That(rows[2].GetIsSentenceStart(2), Is.True); + } + + [Test] + public void GetRows_ThreeCorpora_MissingRows() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 .", TextRowFlags.None), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .", TextRowFlags.None) + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText("text1", new[] { TextRow("text1", 3, "source segment 3 .") }) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]); + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(1)); + Assert.That(rows[0].NRefs.All(r => (int)r[0] == 3)); + Assert.That(rows[0].NSegments.All(r => r.SequenceEqual("source segment 3 .".Split()))); + Assert.That(rows[0].GetIsSentenceStart(0), Is.True); + Assert.That(rows[0].GetIsSentenceStart(1), Is.False); + } + + [Test] + public void GetRows_ThreeCorpora_MissingRows_AllAllRows() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 .", TextRowFlags.None), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .", TextRowFlags.None) + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText("text1", new[] { TextRow("text1", 3, "source segment 3 .") }) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRowsList = [true, true, true] }; + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(3)); + Assert.That(rows[2].NRefs.All(r => (int)r[0] == 3)); + Assert.That(rows[2].NSegments.All(r => r.SequenceEqual("source segment 3 .".Split()))); + Assert.That(rows[2].GetIsSentenceStart(0), Is.True); + Assert.That(rows[2].GetIsSentenceStart(1), Is.False); + } + + [Test] + public void GetRows_ThreeCorpora_MissingRows_SomeAllRows() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 .", TextRowFlags.None), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .", TextRowFlags.None) + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText("text1", new[] { TextRow("text1", 3, "source segment 3 .") }) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) + { + AllRowsList = [true, false, true] + }; + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(2)); + Assert.That(rows[1].NRefs.All(r => (int)r[0] == 3)); + Assert.That(rows[1].NSegments.All(r => r.SequenceEqual("source segment 3 .".Split()))); + Assert.That(rows[1].GetIsSentenceStart(0), Is.True); + Assert.That(rows[1].GetIsSentenceStart(1), Is.False); + } + + [Test] + public void GetRows_ThreeCorpora_MissingRows_MissingLastRows() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 .", TextRowFlags.None), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText("text1", new[] { TextRow("text1", 1, "source segment 1 ."), }) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText("text1", new[] { TextRow("text1", 1, "source segment 1 .") }) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) + { + AllRowsList = [true, false, false] + }; + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(3)); + Assert.That(rows[1].NRefs.All(r => r.Count == 0 || (int)r[0] == 2)); + Assert.That(rows[1].NSegments.All(r => r.Count == 0 || r.SequenceEqual("source segment 2 .".Split()))); + Assert.That(rows[1].GetIsSentenceStart(0), Is.True); + } + + [Test] + public void GetRows_OneCorpus() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 .", TextRowFlags.None), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1]) { AllRowsList = [true] }; + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(2)); + Assert.That(rows[0].NRefs.All(r => (int)r[0] == 1)); + Assert.That(rows[0].NSegments.All(r => r.SequenceEqual("source segment 1 .".Split()))); + Assert.That(rows[0].GetIsSentenceStart(0), Is.False); + } + + [Test] + public void GetRows_ThreeCorpora_Range() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow( + "text1", + 2, + "source segment 2 . source segment 3 .", + TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 3, flags: TextRowFlags.InRange), + TextRow("text1", 4, "source segment 4 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 ."), + TextRow("text1", 4, "source segment 4 .") + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 ."), + TextRow("text1", 4, "source segment 4 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]); + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(3)); + Assert.That(rows[1].NRefs.All(r => r.SequenceEqual([2, 3]))); + Assert.That(rows[1].NSegments[0], Is.EqualTo("source segment 2 . source segment 3 .".Split())); + } + + [Test] + public void GetRows_ThreeCorpora_OverlappingRanges() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow( + "text1", + 2, + "source segment 2 . source segment 3 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 3, flags: TextRowFlags.InRange) + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow( + "text1", + 1, + "source segment 1 . source segment 2 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 2, flags: TextRowFlags.InRange), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]); + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(1), JsonSerializer.Serialize(rows)); + } + + [Test] + public void GetRows_ThreeCorpora_OverlappingRanges_AllIndividualRows() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow( + "text1", + 2, + "source segment 2 . source segment 3 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 3, flags: TextRowFlags.InRange) + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow( + "text1", + 1, + "source segment 1 . source segment 2 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 2, flags: TextRowFlags.InRange), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) + { + AllRowsList = [false, false, true] + }; + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(3), JsonSerializer.Serialize(rows)); + Assert.That(rows[0].NRefs[0], Is.EquivalentTo(new object[] { 1 })); + } + + [Test] + public void GetRows_ThreeCorpora_OverlappingRanges_AllRangeOneThroughTwoRows() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow( + "text1", + 2, + "source segment 2 . source segment 3 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 3, flags: TextRowFlags.InRange) + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow( + "text1", + 1, + "source segment 1 . source segment 2 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 2, flags: TextRowFlags.InRange), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) + { + AllRowsList = [false, true, false] + }; + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(2), JsonSerializer.Serialize(rows)); + Assert.That(rows[0].NRefs[0], Is.EquivalentTo(new object[] { 1, 2 })); + } + + [Test] + public void GetRows_ThreeCorpora_OverlappingRanges_AllRangeTwoThroughThreeRows() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow( + "text1", + 2, + "source segment 2 . source segment 3 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 3, flags: TextRowFlags.InRange) + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow( + "text1", + 1, + "source segment 1 . source segment 2 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 2, flags: TextRowFlags.InRange), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) + { + AllRowsList = [true, false, false] + }; + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(2), JsonSerializer.Serialize(rows)); + Assert.That(rows[0].NRefs[0], Is.EquivalentTo(new object[] { 1 })); + } + + [Test] + public void GetRows_ThreeCorpora_SameRefManyToMany() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2-1 ."), + TextRow("text1", 2, "source segment 2-2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2-1 ."), + TextRow("text1", 2, "source segment 2-2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2-1 ."), + TextRow("text1", 2, "source segment 2-2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]); + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(10)); + } + + private static TextRow TextRow( + string textId, + object rowRef, + string text = "", + TextRowFlags flags = TextRowFlags.SentenceStart + ) + { + return new TextRow(textId, rowRef) + { + Segment = text.Length == 0 ? Array.Empty() : text.Split(), + Flags = flags + }; + } +} diff --git a/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs b/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs index dd1895a3..d40529c6 100644 --- a/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs @@ -61,14 +61,14 @@ public void GetRows_NoMissingRows() Assert.That(rows[0].TargetSegment, Is.EqualTo("target segment 1 .".Split())); Assert.That(rows[0].IsSourceSentenceStart, Is.False); Assert.That(rows[0].IsTargetSentenceStart, Is.True); - // Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(0, 0) })); + Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(0, 0) })); Assert.That(rows[2].SourceRefs, Is.EqualTo(new[] { 3 })); Assert.That(rows[2].TargetRefs, Is.EqualTo(new[] { 3 })); Assert.That(rows[2].SourceSegment, Is.EqualTo("source segment 3 .".Split())); Assert.That(rows[2].TargetSegment, Is.EqualTo("target segment 3 .".Split())); Assert.That(rows[2].IsSourceSentenceStart, Is.True); Assert.That(rows[2].IsTargetSentenceStart, Is.False); - // Assert.That(rows[2].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(2, 2) })); + Assert.That(rows[2].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(2, 2) })); } [Test] @@ -109,12 +109,12 @@ public void GetRows_MissingMiddleTargetRow() Assert.That(rows[0].TargetRefs, Is.EqualTo(new[] { 1 })); Assert.That(rows[0].SourceSegment, Is.EqualTo("source segment 1 .".Split())); Assert.That(rows[0].TargetSegment, Is.EqualTo("target segment 1 .".Split())); - // Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(0, 0) })); + Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(0, 0) })); Assert.That(rows[1].SourceRefs, Is.EqualTo(new[] { 3 })); Assert.That(rows[1].TargetRefs, Is.EqualTo(new[] { 3 })); Assert.That(rows[1].SourceSegment, Is.EqualTo("source segment 3 .".Split())); Assert.That(rows[1].TargetSegment, Is.EqualTo("target segment 3 .".Split())); - // Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(2, 2) })); + Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(2, 2) })); } [Test] @@ -155,12 +155,12 @@ public void GetRows_MissingMiddleSourceRow() Assert.That(rows[0].TargetRefs, Is.EqualTo(new[] { 1 })); Assert.That(rows[0].SourceSegment, Is.EqualTo("source segment 1 .".Split())); Assert.That(rows[0].TargetSegment, Is.EqualTo("target segment 1 .".Split())); - // Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(0, 0) })); + Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(0, 0) })); Assert.That(rows[1].SourceRefs, Is.EqualTo(new[] { 3 })); Assert.That(rows[1].TargetRefs, Is.EqualTo(new[] { 3 })); Assert.That(rows[1].SourceSegment, Is.EqualTo("source segment 3 .".Split())); Assert.That(rows[1].TargetSegment, Is.EqualTo("target segment 3 .".Split())); - // Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(2, 2) })); + Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(2, 2) })); } [Test] @@ -201,12 +201,12 @@ public void GetRows_MissingLastTargetRow() Assert.That(rows[0].TargetRefs, Is.EqualTo(new[] { 1 })); Assert.That(rows[0].SourceSegment, Is.EqualTo("source segment 1 .".Split())); Assert.That(rows[0].TargetSegment, Is.EqualTo("target segment 1 .".Split())); - // Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(0, 0) })); + Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(0, 0) })); Assert.That(rows[1].SourceRefs, Is.EqualTo(new[] { 2 })); Assert.That(rows[1].TargetRefs, Is.EqualTo(new[] { 2 })); Assert.That(rows[1].SourceSegment, Is.EqualTo("source segment 2 .".Split())); Assert.That(rows[1].TargetSegment, Is.EqualTo("target segment 2 .".Split())); - // Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(1, 1) })); + Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(1, 1) })); } [Test] @@ -247,12 +247,12 @@ public void GetRows_MissingLastSourceRow() Assert.That(rows[0].TargetRefs, Is.EqualTo(new[] { 1 })); Assert.That(rows[0].SourceSegment, Is.EqualTo("source segment 1 .".Split())); Assert.That(rows[0].TargetSegment, Is.EqualTo("target segment 1 .".Split())); - // Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(0, 0) })); + Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(0, 0) })); Assert.That(rows[1].SourceRefs, Is.EqualTo(new[] { 2 })); Assert.That(rows[1].TargetRefs, Is.EqualTo(new[] { 2 })); Assert.That(rows[1].SourceSegment, Is.EqualTo("source segment 2 .".Split())); Assert.That(rows[1].TargetSegment, Is.EqualTo("target segment 2 .".Split())); - // Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(1, 1) })); + Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(1, 1) })); } [Test] @@ -293,12 +293,12 @@ public void GetRows_MissingFirstTargetRow() Assert.That(rows[0].TargetRefs, Is.EqualTo(new[] { 2 })); Assert.That(rows[0].SourceSegment, Is.EqualTo("source segment 2 .".Split())); Assert.That(rows[0].TargetSegment, Is.EqualTo("target segment 2 .".Split())); - // Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(1, 1) })); + Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(1, 1) })); Assert.That(rows[1].SourceRefs, Is.EqualTo(new[] { 3 })); Assert.That(rows[1].TargetRefs, Is.EqualTo(new[] { 3 })); Assert.That(rows[1].SourceSegment, Is.EqualTo("source segment 3 .".Split())); Assert.That(rows[1].TargetSegment, Is.EqualTo("target segment 3 .".Split())); - // Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(2, 2) })); + Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(2, 2) })); } [Test] @@ -339,12 +339,12 @@ public void GetRows_MissingFirstSourceRow() Assert.That(rows[0].TargetRefs, Is.EqualTo(new[] { 2 })); Assert.That(rows[0].SourceSegment, Is.EqualTo("source segment 2 .".Split())); Assert.That(rows[0].TargetSegment, Is.EqualTo("target segment 2 .".Split())); - // Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(1, 1) })); + Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(1, 1) })); Assert.That(rows[1].SourceRefs, Is.EqualTo(new[] { 3 })); Assert.That(rows[1].TargetRefs, Is.EqualTo(new[] { 3 })); Assert.That(rows[1].SourceSegment, Is.EqualTo("source segment 3 .".Split())); Assert.That(rows[1].TargetSegment, Is.EqualTo("target segment 3 .".Split())); - // Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(2, 2) })); + Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(2, 2) })); } [Test] @@ -590,12 +590,12 @@ public void GetRows_AllSourceRows() ParallelTextRow[] rows = parallelCorpus.ToArray(); Assert.That(rows.Length, Is.EqualTo(7)); Assert.That(rows[1].SourceRefs, Is.EqualTo(new[] { 2 })); - // Assert.That(rows[1].TargetRefs, Is.Empty); + Assert.That(rows[1].TargetRefs, Is.Empty); Assert.That(rows[1].SourceSegment, Is.EqualTo("source segment 2 .".Split())); Assert.That(rows[1].TargetSegment, Is.Empty); Assert.That(rows[4].SourceRefs, Is.EqualTo(new[] { 5 })); - // Assert.That(rows[4].TargetRefs, Is.Empty); + Assert.That(rows[4].TargetRefs, Is.Empty); Assert.That(rows[4].SourceSegment, Is.EqualTo("source segment 5 .".Split())); Assert.That(rows[4].TargetSegment, Is.Empty); }