Skip to content

Commit

Permalink
Passing tests; added alignment corpus
Browse files (browse the repository at this point in the history)
  • Loading branch information
Enkidu93 committed Nov 5, 2024
1 parent 6d2719f commit c3ef946
Show file tree
Hide file tree
Showing 5 changed files with 636 additions and 59 deletions.
100 changes: 68 additions & 32 deletions src/SIL.Machine/Corpora/NParallelTextCorpus.cs
Original file line number | Diff line number | Diff line change
Expand Up @@ -28,6 +28,7 @@ public bool GetIsTokenized(int i) =>

public IReadOnlyList<bool> AllRowsList { get; set; }
public IReadOnlyList<ITextCorpus> Corpora { get; }
public IAlignmentCorpus AlignmentCorpus { get; set; }
public IComparer<object> RowRefComparer { get; }

private static HashSet<string> GetTextIdsFromCorpora(
Expand Down Expand Up @@ -63,8 +64,9 @@ public override IEnumerable<NParallelTextRow> GetRows(IEnumerable<string> textId
if (textIds != null)
filterTextIds.IntersectWith(textIds);

IEnumerator<AlignmentRow> alignmentEnumerator = null;
IList<IEnumerator<TextRow>> enumeratedCorpora = new List<IEnumerator<TextRow>>();
IEnumerable<NParallelTextRow> ret = new List<NParallelTextRow>() { };
IEnumerable<NParallelTextRow> rows = new List<NParallelTextRow>() { };
try
{
for (int i = 0; i < Corpora.Count; i++)
Expand All @@ -74,22 +76,25 @@ public override IEnumerable<NParallelTextRow> GetRows(IEnumerable<string> textId
new TextCorpusEnumerator(enumerator, Corpora[0].Versification, Corpora[i].Versification)
);
}
ret = GetRows(enumeratedCorpora).ToList(); //TODO cleanup

if (AlignmentCorpus != null)
alignmentEnumerator = AlignmentCorpus.GetRows(filterTextIds).GetEnumerator();
rows = GetRows(enumeratedCorpora, alignmentEnumerator).ToList();
}
finally
{
foreach (IEnumerator<TextRow> enumerator in enumeratedCorpora)
{
enumerator.Dispose();
}
alignmentEnumerator?.Dispose();
}
return ret;
return rows;
}

private bool AnyInRangeWithSegments(IList<TextRow> rows)
private bool AllInRangeHaveSegments(IList<TextRow> rows)
{
return (rows.Any(r => r.IsInRange && r.Segment.Count > 0) && rows.Any(r => !r.IsInRange))
|| rows.All(r => r.IsInRange && r.Segment.Count > 0);
return rows.All(r => (r.IsInRange && r.Segment.Count > 0) || (!r.IsInRange));
}

private IList<int> MinRefIndexes(IList<object> refs)
Expand All @@ -112,7 +117,10 @@ private IList<int> MinRefIndexes(IList<object> refs)
return minRefIndexes;
}

private IEnumerable<NParallelTextRow> GetRows(IList<IEnumerator<TextRow>> listOfEnumerators)
private IEnumerable<NParallelTextRow> GetRows(
IList<IEnumerator<TextRow>> listOfEnumerators,
IEnumerator<AlignmentRow> alignmentEnumerator
)
{
{
var rangeInfo = new NRangeInfo(N)
Expand Down Expand Up @@ -178,17 +186,14 @@ private IEnumerable<NParallelTextRow> GetRows(IList<IEnumerator<TextRow>> listOf
foreach (
NParallelTextRow row in CreateMinRefRows(
rangeInfo,
currentRows,
minRefIndexes,
nonMinRefIndexes,
currentRows.ToArray(),
minRefIndexes.ToArray(),
nonMinRefIndexes.ToArray(),
forceInRange: minRefIndexes
.Select(i =>
nonMinEnumerators
.Where(e => e.Current != null)
.Select(e => e.Current.TextId)
.Union(new List<string> { currentRows[i].TextId })
.Distinct()
.Count() == 1 //TODO clean up
nonMinEnumerators.All(e =>
e.Current != null && e.Current.TextId == currentRows[i].TextId
)
&& nonMinEnumerators
.Where(e => e.Current != null)
.Select(e => !e.Current.IsRangeStart && e.Current.IsInRange)
Expand All @@ -210,18 +215,39 @@ NParallelTextRow row in CreateMinRefRows(
else if (minRefIndexes.Count == (N - completed.Count(c => c)))
// the refs are all the same
{
int compareAlignmentCorpus = -1;
if (AlignmentCorpus != null)
{
do
{
try
{
compareAlignmentCorpus = alignmentEnumerator.MoveNext()
? RowRefComparer.Compare(
currentIncompleteRows[0].Ref,
alignmentEnumerator.Current.Ref
)
: 1;
}
catch (ArgumentException)
{
throw new CorpusAlignmentException(
currentRows.Select(e => e.Ref.ToString()).ToArray()
);
}
} while (compareAlignmentCorpus < 0);
}

if (
minRefIndexes
.Select(i =>
!AllRowsList[i]
&& minRefIndexes
.Where(j => j != i && !completed[j] && listOfEnumerators[j].Current.IsInRange)
.Any()
listOfEnumerators[i].Current.IsInRange
&& minRefIndexes.All(j => j == i || !AllRowsList[j])
)
.Any(b => b)
)
{
if (rangeInfo.IsInRange && AnyInRangeWithSegments(currentIncompleteRows))
if (rangeInfo.IsInRange && AllInRangeHaveSegments(currentIncompleteRows))
{
yield return rangeInfo.CreateRow();
}
Expand Down Expand Up @@ -256,7 +282,15 @@ NParallelTextRow row in CreateMinRefRows(
}
}
}
foreach (NParallelTextRow row in CreateRows(rangeInfo, currentIncompleteRows))
foreach (
NParallelTextRow row in CreateRows(
rangeInfo,
currentIncompleteRows,
alignedWordPairs: AlignmentCorpus != null && compareAlignmentCorpus == 0
? alignmentEnumerator.Current.AlignedWordPairs.ToArray()
: null
)
)
{
yield return row;
}
Expand All @@ -281,7 +315,7 @@ NParallelTextRow row in CreateMinRefRows(
}
}

private object[] UnifyVersification(object[] refs, int i)
private object[] CorrectVersification(object[] refs, int i)
{
if (Corpora.Any(c => c.Versification == null) || refs.Length == 0)
return refs;
Expand All @@ -293,8 +327,9 @@ private object[] UnifyVersification(object[] refs, int i)

private IEnumerable<NParallelTextRow> CreateRows(
NRangeInfo rangeInfo,
IList<TextRow> rows,
IList<bool> forceInRange = null
IReadOnlyList<TextRow> rows,
IReadOnlyList<bool> forceInRange = null,
IReadOnlyList<AlignedWordPair> alignedWordPairs = null
)
{
if (rangeInfo.IsInRange)
Expand All @@ -313,30 +348,31 @@ private IEnumerable<NParallelTextRow> CreateRows(
{
textId = textId ?? rows[i]?.TextId;
refs.Add(
UnifyVersification(rows[i].Ref == null ? new object[] { } : new object[] { rows[i].Ref }, i)
CorrectVersification(rows[i].Ref == null ? new object[] { } : new object[] { rows[i].Ref }, i)
);
flags.Add(rows[i].Flags);
}
else
{
refs.Add(new object[] { });
refs.Add(CorrectVersification(refRefs, i));
flags.Add(forceInRange != null && forceInRange[i] ? TextRowFlags.InRange : TextRowFlags.None);
}
}

yield return new NParallelTextRow(textId, refs)
{
NSegments = rows.Select(r => r?.Segment ?? Array.Empty<string>()).ToArray(),
NFlags = flags.ToReadOnlyList()
NFlags = flags.ToReadOnlyList(),
AlignedWordPairs = alignedWordPairs
};
}

private IEnumerable<NParallelTextRow> CreateMinRefRows(
NRangeInfo rangeInfo,
IList<TextRow> currentRows,
IList<int> minRefIndexes,
IList<int> nonMinRefIndexes,
IList<bool> forceInRange = null
IReadOnlyList<TextRow> currentRows,
IReadOnlyList<int> minRefIndexes,
IReadOnlyList<int> nonMinRefIndexes,
IReadOnlyList<bool> forceInRange = null
)
{
List<(IList<TextRow> Rows, int Index)> sameRefRowsPerIndex = nonMinRefIndexes
Expand Down
2 changes: 2 additions & 0 deletions src/SIL.Machine/Corpora/NParallelTextRow.cs
Original file line number | Diff line number | Diff line change
Expand Up @@ -46,6 +46,8 @@ public bool GetIsRangeStart(int i) =>

public string GetText(int i) => string.Join(" ", NSegments[i]);

public IReadOnlyCollection<AlignedWordPair> AlignedWordPairs { get; set; }

public NParallelTextRow Invert()
{
return new NParallelTextRow(TextId, NRefs.Reverse()) { NFlags = NFlags.Reverse().ToImmutableArray(), };
Expand Down
19 changes: 8 additions & 11 deletions src/SIL.Machine/Corpora/ParallelTextCorpus.cs
Original file line number | Diff line number | Diff line change
Expand Up @@ -20,7 +20,10 @@ public ParallelTextCorpus(
TargetCorpus = targetCorpus;
AlignmentCorpus = alignmentCorpus ?? new DictionaryAlignmentCorpus();
RowRefComparer = rowRefComparer ?? new DefaultRowRefComparer();
NParallelTextCorpus = new NParallelTextCorpus(new List<ITextCorpus> { SourceCorpus, TargetCorpus });
NParallelTextCorpus = new NParallelTextCorpus(new List<ITextCorpus> { SourceCorpus, TargetCorpus })
{
AlignmentCorpus = AlignmentCorpus
};
}

public override bool IsSourceTokenized => SourceCorpus.IsTokenized;
Expand All @@ -45,19 +48,13 @@ public override IEnumerable<ParallelTextRow> GetRows(IEnumerable<string> textIds

foreach (var nRow in NParallelTextCorpus.GetRows(textIds))
{
bool hasTarget = nRow.N > 1;
if (!hasTarget && !AllTargetRows)
continue;
yield return new ParallelTextRow(
nRow.TextId,
nRow.NRefs[0],
hasTarget ? nRow.NRefs[1] : new object[] { }
)
yield return new ParallelTextRow(nRow.TextId, nRow.NRefs[0], nRow.NRefs[1])
{
SourceFlags = nRow.NFlags[0],
TargetFlags = hasTarget ? nRow.NFlags[1] : new TextRowFlags(),
TargetFlags = nRow.NFlags[1],
SourceSegment = nRow.NSegments[0],
TargetSegment = hasTarget ? nRow.NSegments[1] : new string[] { }
TargetSegment = nRow.NSegments[1],
AlignedWordPairs = nRow.AlignedWordPairs
};
}
}
Expand Down
Loading

0 comments on commit c3ef946

Please sign in to comment.