Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Temporary workaround for IfPartPattern in reviewer and other fixes #362

Closed
wants to merge 52 commits into from
Closed
Show file tree
Hide file tree
Changes from 40 commits
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
eeabf2f
fix off-by-one error in FailureStoreReport
rkm Jul 11, 2023
537a1ad
Expand Failure detail shown in RulesView
rkm Jul 12, 2023
cc7fb9e
adjust ii window sizing
rkm Jul 12, 2023
6c2d602
fix count of outstanding failures and adjust node display
rkm Jul 12, 2023
df6bbc4
fix scroll past end of report count
rkm Jul 12, 2023
52dc51d
upload ii binary package on each CI run
rkm Jul 12, 2023
f99bfda
add names to uploaded artifacts
rkm Jul 12, 2023
71a94fa
exit early if file not found
rkm Jul 13, 2023
a80c2f3
implement ToString for FailurePart
rkm Jul 13, 2023
7815b98
Tidy FailureView.Redraw
rkm Jul 25, 2023
6df0217
add temporary ability to use PartRegexRule in reviewer
rkm Jul 25, 2023
a7a9c25
fixup test PackageReferences
rkm Jul 25, 2023
b94b8e7
don't emit a Failure if there are no parts remaining
rkm Jul 25, 2023
fa2986e
fix ii opening any CSV and silently ignoring errors
rkm Aug 7, 2023
015a39b
check IfPartPattern is properly enclosed
rkm Aug 7, 2023
a586749
support words before and after the FailurePart
rkm Aug 7, 2023
024ea22
fixup offset issues
rkm Aug 7, 2023
3ec7354
pull-up offset fixing code to cover all parts
rkm Aug 8, 2023
6ac9554
rename PartRegexRuleTemp -> PartPatternFilterRule
rkm Aug 9, 2023
2c8d561
add tests for some failures cases
rkm Aug 9, 2023
f1633b6
fixup handling of offsets for pixel data
rkm Aug 10, 2023
c793135
avoid multiple enumeration issue
rkm Aug 10, 2023
a842760
fixup offset in test
rkm Aug 10, 2023
ee62305
clarify field names
rkm Aug 23, 2023
9283d69
support hyphenated words in WordBefore/WordAfter
rkm Aug 23, 2023
e3b7ea2
use literal strings
rkm Aug 23, 2023
b2b3d6e
support any classification in PartPatternFilterRule
rkm Aug 23, 2023
a421696
tidy test naming
rkm Aug 23, 2023
2e5ce09
show error when opening initial report fails
rkm Aug 23, 2023
03b435c
load failures in parallel
rkm Aug 23, 2023
7dbe860
cache regexes
rkm Aug 24, 2023
03f0769
ensure timer is stopped before disposing
rkm Aug 24, 2023
c26ea2a
add extra debug for invalid offsets
rkm Aug 24, 2023
7f5e6ab
extract offset handling to function
rkm Aug 24, 2023
e8e0d2b
handle escaped words
rkm Aug 24, 2023
7cc0145
handle additional whitespace in Failure
rkm Aug 24, 2023
74beb43
fixup previous to handle hidden unicode
rkm Aug 24, 2023
ae555ff
fix PartPatternFilterRule being double-counted in title
rkm Nov 14, 2023
9c5f439
add PartPatternFilterRules to title
rkm Nov 14, 2023
f982ac9
add used PartPatternFilterRules as tree node
rkm Nov 14, 2023
b97c47d
add column and classification to tree node
rkm Nov 14, 2023
2bb3fdf
add simple validator mode to ii
rkm Dec 7, 2023
55024e9
improve error messaging
rkm Dec 7, 2023
1b45916
wrap exception
rkm Dec 7, 2023
1de68b0
ignore first exception and continue
rkm Dec 7, 2023
ed469b6
include FailurePart in output
rkm Dec 7, 2023
dc4af8b
guard against offset starting beyond end of problem value
rkm Dec 7, 2023
100c853
fixup handling of HTML
rkm Dec 7, 2023
d4983f4
rework edge case handling
rkm Dec 7, 2023
2298707
always output
rkm Dec 7, 2023
96f4261
only show used rules
rkm Dec 7, 2023
6db3433
support exporting Outstanding Failures to file
rkm Dec 7, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions .github/workflows/dotnet-core.yml
Original file line number Diff line number Diff line change
Expand Up @@ -87,13 +87,22 @@ jobs:
mv target/nerd-0.0.1-SNAPSHOT.jar ../../release/smi-nerd-$(fgrep AssemblyVersion ../../SharedAssemblyInfo.cs | cut -d'"' -f2).jar
- name: Package ii binary
run: |
tag="$(fgrep AssemblyVersion SharedAssemblyInfo.cs | cut -d'"' -f2)"
cd ii
dotnet publish --runtime win-x64 -c Release --self-contained true -o ../ii-win-x64
dotnet publish --runtime linux-x64 -c Release --self-contained true -o ../ii-$(fgrep AssemblyVersion ../SharedAssemblyInfo.cs|cut -d'"' -f2)-cli-linux-x64
dotnet publish --runtime linux-x64 -c Release --self-contained true -o ../ii-"$tag"-cli-linux-x64
cd ../ii-win-x64
zip -q9r ../release/ii-$(fgrep AssemblyVersion ../SharedAssemblyInfo.cs|cut -d'"' -f2)-cli-win-x64.zip .
zip -q9r ../release/ii-"$tag"-cli-win-x64.zip .
cd ..
tar -zcvf ./release/ii-$(fgrep AssemblyVersion SharedAssemblyInfo.cs|cut -d'"' -f2)-cli-linux-x64.tar.gz ii-$(fgrep AssemblyVersion SharedAssemblyInfo.cs|cut -d'"' -f2)-cli-linux-x64
tar -zcvf ./release/ii-"$tag"-cli-linux-x64.tar.gz ii-"$tag"-cli-linux-x64
git_tag="$(git rev-parse --short HEAD)"
cp ./release/ii-"$tag"-cli-linux-x64.tar.gz ./ii-"$git_tag"-cli-linux-x64.tar.gz
- name: Store created ii binary
uses: actions/upload-artifact@v3
with:
name: ii
path: "*.tar.gz"
retention-days: 1
- name: Test ii binary
run: |
set -e
Expand Down Expand Up @@ -122,6 +131,7 @@ jobs:
- name: Store created nupkg files
uses: actions/upload-artifact@v3
with:
name: IsIdentifiablePlugin
path: release/IsIdentifiablePlugin.*.nupkg
retention-days: 1
- name: Upload binaries to release
Expand Down
5 changes: 4 additions & 1 deletion IsIdentifiable/Failures/FailurePart.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
using Equ;
using Equ;

namespace IsIdentifiable.Failures;

Expand Down Expand Up @@ -71,4 +71,7 @@ public bool Includes(int start, int length)

return false;
}

/// <summary>
/// Renders this part as <c>FailurePart(Word,Offset,Classification)</c>, which is handy in
/// debug output and exception messages.
/// </summary>
/// <returns>A single-line description of the part.</returns>
public override string ToString()
{
    return $"{nameof(FailurePart)}({Word},{Offset},{Classification})";
}
}
12 changes: 10 additions & 2 deletions IsIdentifiable/Redacting/OutBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,11 @@
/// </summary>
public List<RegexRule> Rules { get; }

/// <summary>
/// Temp -- do not use.
/// </summary>
public readonly List<PartPatternFilterRule> PartRules_Temp = new();

/// <summary>
/// Persistence of <see cref="RulesFile"/>
/// </summary>
Expand Down Expand Up @@ -68,8 +73,11 @@
else
{
//populated rules file already existed
var deserializer = new Deserializer();
Rules = deserializer.Deserialize<List<RegexRule>>(existingRules) ?? new List<RegexRule>();
var builder = new DeserializerBuilder();
builder.WithTagMapping("!IgnorePartRegexRule", typeof(PartPatternFilterRule));
var allRules = builder.Build().Deserialize<List<RegexRule>>(existingRules) ?? new List<RegexRule>();
Rules = allRules.OfType<RegexRule>().ToList();
PartRules_Temp = allRules.OfType<PartPatternFilterRule>().ToList();
}
}
}
Expand All @@ -81,7 +89,7 @@
/// <param name="action"></param>
/// <param name="overrideRuleFactory">Overrides the current <see cref="RulesFactory"/> and uses this instead</param>
/// <returns>The new / existing rule that covers failure</returns>
protected RegexRule Add(Failure f, RuleAction action, IRulePatternFactory overrideRuleFactory = null)

Check warning on line 92 in IsIdentifiable/Redacting/OutBase.cs

View workflow job for this annotation

GitHub Actions / build

Cannot convert null literal to non-nullable reference type.

Check warning on line 92 in IsIdentifiable/Redacting/OutBase.cs

View workflow job for this annotation

GitHub Actions / build

Cannot convert null literal to non-nullable reference type.
{
var factory = overrideRuleFactory ?? RulesFactory;

Expand Down Expand Up @@ -175,7 +183,7 @@
/// Serializes the current <see cref="Rules"/> to the provided file
/// </summary>
/// <param name="toFile"></param>
public void Save(IFileInfo toFile = null)

Check warning on line 186 in IsIdentifiable/Redacting/OutBase.cs

View workflow job for this annotation

GitHub Actions / build

Cannot convert null literal to non-nullable reference type.

Check warning on line 186 in IsIdentifiable/Redacting/OutBase.cs

View workflow job for this annotation

GitHub Actions / build

Cannot convert null literal to non-nullable reference type.
{
toFile ??= RulesFile;

Expand Down
15 changes: 8 additions & 7 deletions IsIdentifiable/Redacting/ReportReader.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
using IsIdentifiable.Failures;
using IsIdentifiable.Reporting.Reports;
using IsIdentifiable.Rules;
using System;
using System.Collections.Generic;
using System.IO.Abstractions;
using System.Linq;
using System.Threading;
Expand Down Expand Up @@ -52,10 +54,10 @@ public ReportReader(IFileInfo csvFile)
/// <param name="loadedRows"></param>
/// <param name="token"></param>
/// <param name="fileSystem"></param>
public ReportReader(IFileInfo csvFile, Action<int> loadedRows, IFileSystem fileSystem, CancellationToken token)
public ReportReader(IFileInfo csvFile, Action<int> loadedRows, IFileSystem fileSystem, CancellationToken token, List<PartPatternFilterRule>? partRules = null)
{
var report = new FailureStoreReport("", 0, fileSystem);
Failures = FailureStoreReport.Deserialize(csvFile, loadedRows, token).ToArray();
Failures = FailureStoreReport.Deserialize(csvFile, loadedRows, token, partRules).ToArray();
}

/// <summary>
Expand All @@ -77,18 +79,17 @@ public bool Next()
/// by the total number of <see cref="Failures"/>
/// </summary>
/// <param name="index"></param>
public void GoTo(int index)
/// <returns>
/// True if the position changed and the new position is not <c>Failures.Length</c>
/// (i.e. not one past the last element); otherwise false.
/// </returns>
public bool GoTo(int index)
{
    var previous = _current;

    // Clamp the requested index into [0, Failures.Length]
    var atLeastZero = Math.Max(0, index);
    _current = Math.Min(atLeastZero, Failures.Length);

    // Nothing moved, so there is nothing new to report
    if (_current == previous)
        return false;

    return _current != Failures.Length;
}

/// <summary>
/// Provides a human readable count of how far through the <see cref="Failures"/>
/// the <see cref="CurrentIndex"/> is.
/// </summary>
/// <returns></returns>
public string DescribeProgress()
{
return $"{_current}/{Failures.Length}";
}
public string DescribeProgress()
{
    // _current is zero-based; show a one-based position to the user
    var position = _current + 1;
    return $"{position}/{Failures.Length}";
}
}
192 changes: 162 additions & 30 deletions IsIdentifiable/Reporting/Reports/FailureStoreReport.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,17 @@
using IsIdentifiable.Failures;
using IsIdentifiable.Options;
using IsIdentifiable.Reporting.Destinations;
using IsIdentifiable.Rules;
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Data;
using System.IO.Abstractions;
using System.Linq;
using System.Net;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;

namespace IsIdentifiable.Reporting.Reports;

Expand Down Expand Up @@ -130,44 +135,171 @@
/// <param name="token">Cancellation token for aborting the file deserialization (and closing the file again)</param>
/// <returns></returns>
/// <exception cref="Exception"></exception>
public static IEnumerable<Failure> Deserialize(IFileInfo oldFile, Action<int> loadedRows, CancellationToken token)
public static IEnumerable<Failure> Deserialize(IFileInfo oldFile, Action<int> loadedRows, CancellationToken token, IEnumerable<PartPatternFilterRule>? partRules = null)
{
var lineNumber = 0;
partRules ??= new List<PartPatternFilterRule>();

using var stream = oldFile.OpenRead();
using var sr = new System.IO.StreamReader(stream);
using var r = new CsvReader(sr, System.Globalization.CultureInfo.CurrentCulture);
if (r.Read())
r.ReadHeader();
using var reader = new CsvReader(sr, System.Globalization.CultureInfo.CurrentCulture);
if (reader.Read())
reader.ReadHeader();
else
yield break;
lineNumber++;
// "Resource", "ResourcePrimaryKey", "ProblemField", "ProblemValue", "PartWords", "PartClassifications", "PartOffsets"
return Enumerable.Empty<Failure>();

while (r.Read())
{
token.ThrowIfCancellationRequested();
lineNumber++;
var words = r["PartWords"].Split(Separator);
var classes = r["PartClassifications"].Split(Separator);
var offsets = r["PartOffsets"].Split(Separator);

var parts = words.Select((t, i) => new FailurePart(
t,
Enum.TryParse<FailureClassification>(classes[i], true, out var classification) ? classification : throw new Exception($"Invalid failure classification '{classes[i]}' on line {lineNumber}"),
int.TryParse(offsets[i], out var offset) ? offset : throw new Exception($"Invalid offset '{offsets[i]}' on line {lineNumber}"))).ToList();
yield return new Failure(parts)
int totalProcessed = 0;
var localTokenSource = new CancellationTokenSource();
Fixed Show fixed Hide fixed

Check warning

Code scanning / CodeQL

Missing Dispose call on local IDisposable Warning

Disposable 'CancellationTokenSource' is created but not disposed.
using var timerTask = Task.Run(
async () =>
{
Resource = r["Resource"],
ResourcePrimaryKey = r["ResourcePrimaryKey"],
ProblemField = r["ProblemField"],
ProblemValue = r["ProblemValue"],
};

if (lineNumber % 1000 == 0)
loadedRows(lineNumber);
while (!token.IsCancellationRequested && !localTokenSource.Token.IsCancellationRequested)
{
loadedRows(totalProcessed);
await Task.Delay(TimeSpan.FromSeconds(0.1), token);
}
},
token
);

var failures = new ConcurrentBag<Failure>();

try
{
Parallel.ForEach(
reader.GetRecords<FailureStoreReportRecord>(),
new ParallelOptions
{
CancellationToken = token,
},
(FailureStoreReportRecord row) =>
{
if (row.ProblemValue == null)
throw new Exception("ProblemValue was null");

var words = row.PartWords.Split(Separator);
var classes = row.PartClassifications.Split(Separator);
var offsets = row.PartOffsets.Split(Separator);

var parts = words.Select(
(word, index) => new FailurePart(
word,
Enum.TryParse<FailureClassification>(classes[index], true, out var classification) ? classification : throw new Exception($"Invalid failure classification '{classes[index]}'"),
int.TryParse(offsets[index], out var offset) ? offset : throw new Exception($"Invalid offset '{row.PartOffsets}'")
)
).ToList();

if (row.ProblemField != "PixelData")
{
// Fixes any offsets that have been mangled by file endings etc.
foreach (var part in parts)
{
if (row.ProblemValue.Substring(part.Offset, part.Word.Length) == part.Word)
continue;

// Test if the ProblemValue has been HTML escaped
var encodedPartWord = WebUtility.HtmlEncode(part.Word);
try
{
if (row.ProblemValue.Substring(part.Offset, encodedPartWord.Length) == encodedPartWord)
{
part.Word = encodedPartWord;
continue;
}
}
catch (ArgumentOutOfRangeException)
{ }
Fixed Show fixed Hide fixed
Fixed Show fixed Hide fixed

// Test if the ProblemValue has hidden unicode symbols
var withoutInvisible = Regex.Replace(row.ProblemValue, @"\p{C}+", string.Empty);
if (withoutInvisible.Substring(part.Offset, part.Word.Length) == part.Word)
{
part.Word = row.ProblemValue.Substring(part.Offset, part.Word.Length + 1);

if (row.ProblemValue.Substring(part.Offset, part.Word.Length) != part.Word)
throw new Exception($"Could not fix hidden unicode characters in Failure:\n===\n{row}\n===");

continue;
}

// Finally, try shifting the offset around to find the word
try
{
FixupOffsets(row, part);
}
catch (ArgumentOutOfRangeException e)
{
throw new Exception($"Could not fixup Offset value in Failure:\n{row}", e);
}
}
}

/* TEMP - Filter out any FailureParts covered by a PartPatternFilterRule */
var toRemove = new List<FailurePart>();
foreach (var partRule in partRules)
{
if (!string.IsNullOrWhiteSpace(partRule.IfColumn) && !string.Equals(partRule.IfColumn, row.ProblemField, StringComparison.InvariantCultureIgnoreCase))
continue;

foreach (var part in parts.Where(x => partRule.Covers(x, row.ProblemValue)))
{
toRemove.Add(part);
partRule.IncrementUsed();
}
}
parts = parts.Except(toRemove).ToList();
/* TEMP */

if (parts.Any())
failures.Add(new Failure(parts)
{
Resource = row.Resource,
ResourcePrimaryKey = row.ResourcePrimaryKey,
ProblemField = row.ProblemField,
ProblemValue = row.ProblemValue,
});

Interlocked.Increment(ref totalProcessed);
}
);
}
finally
{
localTokenSource.Cancel();
timerTask.Wait();
}

loadedRows(lineNumber);
loadedRows(totalProcessed);

return failures;
}

/// <summary>
/// Repairs <paramref name="part"/>'s Offset so that it points at an occurrence of its Word
/// inside the row's ProblemValue. The search looks ahead from the recorded offset first
/// and, if nothing is found, looks back towards the start of the value.
/// </summary>
/// <remarks>
/// The recorded offset is treated only as a hint: offsets that are negative or that lie
/// beyond the last position at which the word could fit are clamped into range rather than
/// aborting the search. Comparison is ordinal.
/// </remarks>
/// <param name="row">The record whose ProblemValue is searched.</param>
/// <param name="part">The part whose Offset is updated in place when a match is found.</param>
/// <exception cref="ArgumentOutOfRangeException">
/// The word does not occur anywhere in the searched range. The part's Offset is left unchanged.
/// </exception>
private static void FixupOffsets(FailureStoreReportRecord row, FailurePart part)
{
    var value = row.ProblemValue;
    var word = part.Word;
    var origOffset = part.Offset;

    // Highest index at which the word could still fit inside the value (negative if it cannot fit at all)
    var lastStart = value.Length - word.Length;

    // Try looking ahead first...
    for (var candidate = Math.Max(origOffset, 0); candidate <= lastStart; candidate++)
    {
        if (string.CompareOrdinal(value, candidate, word, 0, word.Length) == 0)
        {
            part.Offset = candidate;
            return;
        }
    }

    // ...then back. A non-positive offset means the forward pass already covered every position.
    if (origOffset > 0)
    {
        for (var candidate = Math.Min(origOffset - 1, lastStart); candidate >= 0; candidate--)
        {
            if (string.CompareOrdinal(value, candidate, word, 0, word.Length) == 0)
            {
                part.Offset = candidate;
                return;
            }
        }
    }

    // Same exception type the Substring-based probing used to surface, so existing handlers still apply
    throw new ArgumentOutOfRangeException(nameof(part), $"Could not locate '{word}' near offset {origOffset}");
}

/// <summary>
/// One row of a failure store report, as materialized by
/// <c>reader.GetRecords&lt;FailureStoreReportRecord&gt;()</c> in <c>Deserialize</c>.
/// Property names and order are deliberately left untouched because the CSV reader
/// populates this type for us.
/// </summary>
internal class FailureStoreReportRecord
{
    /// <summary>Copied verbatim onto <c>Failure.Resource</c>.</summary>
    public string Resource { get; init; }

    /// <summary>Copied verbatim onto <c>Failure.ResourcePrimaryKey</c>.</summary>
    public string ResourcePrimaryKey { get; init; }

    /// <summary>Name of the field the problem was found in (e.g. compared against "PixelData").</summary>
    public string ProblemField { get; init; }

    /// <summary>The full value in which the failing parts were found.</summary>
    public string ProblemValue { get; init; }

    /// <summary>Separator-delimited list of the failing words.</summary>
    public string PartWords { get; init; }

    /// <summary>Separator-delimited list of classifications, parallel to <see cref="PartWords"/>.</summary>
    public string PartClassifications { get; init; }

    /// <summary>Separator-delimited list of integer offsets, parallel to <see cref="PartWords"/>.</summary>
    public string PartOffsets { get; init; }

    /// <summary>
    /// Formats every column of the row, pipe-delimited, for use in error messages.
    /// </summary>
    public override string ToString()
    {
        var columns = new[]
        {
            Resource,
            ResourcePrimaryKey,
            ProblemField,
            ProblemValue,
            PartWords,
            PartClassifications,
            PartOffsets,
        };

        return "Failure(" + string.Join("|", columns) + ")";
    }
}
}
Loading
Loading