From c34ac662c1ebd5043b53f86fd78fe330916a223b Mon Sep 17 00:00:00 2001 From: Ruairidh MacLeod Date: Thu, 7 Dec 2023 13:20:47 +0000 Subject: [PATCH] add simple validator mode to ii --- .../IsIdentifiableReportValidatorOptions.cs | 47 ++++ .../Reporting/Reports/FailureStoreReport.cs | 228 ++++++++++-------- ii/Program.cs | 30 ++- 3 files changed, 203 insertions(+), 102 deletions(-) create mode 100644 IsIdentifiable/Options/IsIdentifiableReportValidatorOptions.cs diff --git a/IsIdentifiable/Options/IsIdentifiableReportValidatorOptions.cs b/IsIdentifiable/Options/IsIdentifiableReportValidatorOptions.cs new file mode 100644 index 00000000..612d799b --- /dev/null +++ b/IsIdentifiable/Options/IsIdentifiableReportValidatorOptions.cs @@ -0,0 +1,47 @@ +using CommandLine; +using IsIdentifiable.Reporting.Reports; +using System; + +namespace IsIdentifiable.Options; + +/// <summary> +/// CLI options for the validator +/// </summary> +[Verb("validate", HelpText = "Validate a FailureStoreReport")] +public class IsIdentifiableReportValidatorOptions +{ + /// <summary> + /// The CSV list of failures to process. Must be in the format of a <see cref="FailureStoreReport"/> + /// </summary> + [Option('f', "file", + Required = true, + HelpText = "Pre load an existing failures file" + )] + public string FailuresCsv { get; set; } + + /// <summary> + /// Sets UseSystemConsole to true for Terminal.gui (i.e. uses the NetDriver which is based on System.Console) + /// </summary> + [Option("usc", HelpText = "Sets UseSystemConsole to true for Terminal.gui (i.e. uses the NetDriver which is based on System.Console)")] + public bool UseSystemConsole { get; internal set; } + + /// <summary> + /// Sets the user interface to use a specific color palette yaml file + /// </summary> + [Option("theme", HelpText = "Sets the user interface to use a specific color palette yaml file")] + public string Theme { get; set; } + + + /// <summary> + /// Populates values in this instance where no value yet exists and there is a value in <paramref name="globalOpts"/> + /// to inherit. 
+ /// </summary> + /// <param name="globalOpts">Reviewer options whose Theme is copied when this instance has none</param> + public virtual void InheritValuesFrom(IsIdentifiableReviewerOptions globalOpts) + { + ArgumentNullException.ThrowIfNull(globalOpts); + + if (Theme == null && !string.IsNullOrWhiteSpace(globalOpts.Theme)) + Theme = globalOpts.Theme; + } +} diff --git a/IsIdentifiable/Reporting/Reports/FailureStoreReport.cs b/IsIdentifiable/Reporting/Reports/FailureStoreReport.cs index 1af69e2a..6bab3309 100644 --- a/IsIdentifiable/Reporting/Reports/FailureStoreReport.cs +++ b/IsIdentifiable/Reporting/Reports/FailureStoreReport.cs @@ -133,9 +133,11 @@ public static IEnumerable Deserialize(IFileInfo oldFile) /// Action to call periodically as records are read from the file (for /// when the file is very big and you want to show progress etc) /// Cancellation token for aborting the file deserialication (and closing the file again) + /// <param name="partRules">Rules used to filter covered parts out of each failure; treated as empty when null</param> + /// <param name="runParallel">When true rows are processed with Parallel.ForEach; when false they are processed one at a time and per-row errors are written to stderr</param> /// /// - public static IEnumerable Deserialize(IFileInfo oldFile, Action loadedRows, CancellationToken token, IEnumerable? 
partRules = null, bool runParallel = true) { partRules ??= new List(); @@ -149,7 +151,11 @@ public static IEnumerable Deserialize(IFileInfo oldFile, Action lo int totalProcessed = 0; var localTokenSource = new CancellationTokenSource(); - using var timerTask = Task.Run( + var failures = new ConcurrentBag(); + + if (runParallel) + { + using var timerTask = Task.Run( async () => { while (!token.IsCancellationRequested && !localTokenSource.Token.IsCancellationRequested) @@ -159,118 +165,138 @@ public static IEnumerable Deserialize(IFileInfo oldFile, Action lo } }, token - ); - - var failures = new ConcurrentBag(); + ); - try + try + { + Parallel.ForEach( + reader.GetRecords(), + new ParallelOptions + { + CancellationToken = token, + }, + (FailureStoreReportRecord row) => Process(row, partRules, failures, ref totalProcessed) + ); + } + finally + { + localTokenSource.Cancel(); + timerTask.Wait(); + } + } + else { - Parallel.ForEach( - reader.GetRecords(), - new ParallelOptions + var problems = 0; + foreach (var row in reader.GetRecords()) + { + try { - CancellationToken = token, - }, - (FailureStoreReportRecord row) => + Process(row, partRules, failures, ref totalProcessed); + } + catch (Exception e) { - if (row.ProblemValue == null) - throw new Exception("ProblemValue was null"); - - var words = row.PartWords.Split(Separator); - var classes = row.PartClassifications.Split(Separator); - var offsets = row.PartOffsets.Split(Separator); - - var parts = words.Select( - (word, index) => new FailurePart( - word, - Enum.TryParse(classes[index], true, out var classification) ? classification : throw new Exception($"Invalid failure classification '{classes[index]}'"), - int.TryParse(offsets[index], out var offset) ? offset : throw new Exception($"Invalid offset '{row.PartOffsets}'") - ) - ).ToList(); - - if (row.ProblemField != "PixelData") - { - // Fixes any offsets that have been mangled by file endings etc. 
- foreach (var part in parts) - { - if (row.ProblemValue.Substring(part.Offset, part.Word.Length) == part.Word) - continue; - - // Test if the ProblemValue has been HTML escaped - var encodedPartWord = WebUtility.HtmlEncode(part.Word); - try - { - if (row.ProblemValue.Substring(part.Offset, encodedPartWord.Length) == encodedPartWord) - { - part.Word = encodedPartWord; - continue; - } - } - catch (ArgumentOutOfRangeException) - { } - - // Test if the ProblemValue has hidden unicode symbols - var withoutInvisible = Regex.Replace(row.ProblemValue, @"\p{C}+", string.Empty); - if (withoutInvisible.Substring(part.Offset, part.Word.Length) == part.Word) - { - part.Word = row.ProblemValue.Substring(part.Offset, part.Word.Length + 1); - - if (row.ProblemValue.Substring(part.Offset, part.Word.Length) != part.Word) - throw new Exception($"Could not fix hidden unicode characters in Failure:\n===\n{row}\n==="); - - continue; - } - - // Finally, try shifting the offset around to find the word - try - { - FixupOffsets(row, part); - } - catch (ArgumentOutOfRangeException e) - { - throw new Exception($"Could not fixup Offset value in Failure:\n{row}", e); - } - } - } + Console.Error.WriteLine($"{row.ResourcePrimaryKey}: {e}"); + problems++; + } + } + + if (problems > 0) + Console.Error.WriteLine($"Problem with {problems}/{totalProcessed + problems} records"); + } + + loadedRows(totalProcessed); + + return failures; + } - /* TEMP - Filter out any FailureParts covered by an PartPatternFilterRule */ - var toRemove = new List(); - foreach (var partRule in partRules) + private static void Process(FailureStoreReportRecord row, IEnumerable? 
partRules, ConcurrentBag failures, ref int totalProcessed) + { + if (row.ProblemValue == null) + throw new Exception("ProblemValue was null"); + + var words = row.PartWords.Split(Separator); + var classes = row.PartClassifications.Split(Separator); + var offsets = row.PartOffsets.Split(Separator); + + var parts = words.Select( + (word, index) => new FailurePart( + word, + Enum.TryParse(classes[index], true, out var classification) ? classification : throw new Exception($"Invalid failure classification '{classes[index]}'"), + int.TryParse(offsets[index], out var offset) ? offset : throw new Exception($"Invalid offset '{row.PartOffsets}'") + ) + ).ToList(); + + if (row.ProblemField != "PixelData") + { + // Fixes any offsets that have been mangled by file endings etc. + foreach (var part in parts) + { + if (row.ProblemValue.Substring(part.Offset, part.Word.Length) == part.Word) + continue; + + // Test if the ProblemValue has been HTML escaped + var encodedPartWord = WebUtility.HtmlEncode(part.Word); + try + { + if (row.ProblemValue.Substring(part.Offset, encodedPartWord.Length) == encodedPartWord) { - if (!string.IsNullOrWhiteSpace(partRule.IfColumn) && !string.Equals(partRule.IfColumn, row.ProblemField, StringComparison.InvariantCultureIgnoreCase)) - continue; - - foreach (var part in parts.Where(x => partRule.Covers(x, row.ProblemValue))) - { - toRemove.Add(part); - partRule.IncrementUsed(); - } + part.Word = encodedPartWord; + continue; } - parts = parts.Except(toRemove).ToList(); - /* TEMP */ - - if (parts.Any()) - failures.Add(new Failure(parts) - { - Resource = row.Resource, - ResourcePrimaryKey = row.ResourcePrimaryKey, - ProblemField = row.ProblemField, - ProblemValue = row.ProblemValue, - }); - - Interlocked.Increment(ref totalProcessed); } - ); + catch (ArgumentOutOfRangeException) + { } + + // Test if the ProblemValue has hidden unicode symbols + var withoutInvisible = Regex.Replace(row.ProblemValue, @"\p{C}+", string.Empty); + if 
(withoutInvisible.Substring(part.Offset, part.Word.Length) == part.Word) + { + part.Word = row.ProblemValue.Substring(part.Offset, part.Word.Length + 1); + + if (row.ProblemValue.Substring(part.Offset, part.Word.Length) != part.Word) + throw new Exception($"Could not fix hidden unicode characters in Failure:\n===\n{row}\n==="); + + continue; + } + + // Finally, try shifting the offset around to find the word + try + { + FixupOffsets(row, part); + } + catch (ArgumentOutOfRangeException e) + { + throw new Exception($"Could not fixup Offset value in Failure:\n{row}", e); + } + } } - finally + + /* TEMP - Filter out any FailureParts covered by an PartPatternFilterRule */ + var toRemove = new List(); + foreach (var partRule in partRules) { - localTokenSource.Cancel(); - timerTask.Wait(); + if (!string.IsNullOrWhiteSpace(partRule.IfColumn) && !string.Equals(partRule.IfColumn, row.ProblemField, StringComparison.InvariantCultureIgnoreCase)) + continue; + + foreach (var part in parts.Where(x => partRule.Covers(x, row.ProblemValue))) + { + toRemove.Add(part); + partRule.IncrementUsed(); + } } + parts = parts.Except(toRemove).ToList(); + /* TEMP */ - loadedRows(totalProcessed); + if (parts.Any()) + failures.Add(new Failure(parts) + { + Resource = row.Resource, + ResourcePrimaryKey = row.ResourcePrimaryKey, + ProblemField = row.ProblemField, + ProblemValue = row.ProblemValue, + }); - return failures; + Interlocked.Increment(ref totalProcessed); } private static void FixupOffsets(FailureStoreReportRecord row, FailurePart part) diff --git a/ii/Program.cs b/ii/Program.cs index b7e20739..c0ffb2ed 100644 --- a/ii/Program.cs +++ b/ii/Program.cs @@ -6,12 +6,14 @@ using FAnsi.Implementations.PostgreSql; using FellowOakDicom; using IsIdentifiable.Options; +using IsIdentifiable.Reporting.Reports; using IsIdentifiable.Runners; using Microsoft.Extensions.FileSystemGlobbing; using System; using System.IO.Abstractions; using System.Linq; using System.Text.RegularExpressions; +using 
System.Threading; using YamlDotNet.Serialization; namespace ii; @@ -101,13 +103,15 @@ public static int Main(string[] args) IsIdentifiableDicomFileOptions, IsIdentifiableMongoOptions, IsIdentifiableFileGlobOptions, - IsIdentifiableReviewerOptions>(args) + IsIdentifiableReviewerOptions, + IsIdentifiableReportValidatorOptions>(args) .MapResult( (IsIdentifiableRelationalDatabaseOptions o) => Run(o, fileSystem), (IsIdentifiableDicomFileOptions o) => Run(o, fileSystem), (IsIdentifiableMongoOptions o) => Run(o, fileSystem), (IsIdentifiableFileGlobOptions o) => Run(o, fileSystem), (IsIdentifiableReviewerOptions o) => Run(o, fileSystem), + (IsIdentifiableReportValidatorOptions o) => Run(o, fileSystem), // return exit code 0 for user requests for help errors => args.Any(a => a.Equals("--help", StringComparison.InvariantCultureIgnoreCase)) ? 0 : 1); @@ -146,6 +150,30 @@ private static int Run(IsIdentifiableReviewerOptions opts, IFileSystem fileSyste return reviewer.Run(); } + private static int Run(IsIdentifiableReportValidatorOptions opts, IFileSystem fileSystem) + { + if (GlobalOptions?.IsIdentifiableReviewerOptions != null) + opts.InheritValuesFrom(GlobalOptions.IsIdentifiableReviewerOptions); + + if (!fileSystem.File.Exists(opts.FailuresCsv)) + { + Console.Error.WriteLine($"Error: Could not find {opts.FailuresCsv}"); + return 1; + } + + const string expectedHeader = "Resource,ResourcePrimaryKey,ProblemField,ProblemValue,PartWords,PartClassifications,PartOffsets"; + var line = fileSystem.File.ReadLines(opts.FailuresCsv).FirstOrDefault(); + if (line == null || Regex.Replace(line, @"\s+", "") != expectedHeader) + { + Console.Error.WriteLine($"Error: Expected CSV Failure header {expectedHeader}"); + return 1; + } + + var report = new FailureStoreReport("", 0, fileSystem); + var failures = FailureStoreReport.Deserialize(fileSystem.FileInfo.New(opts.FailuresCsv), (_) => { }, CancellationToken.None, partRules: null, runParallel: false).ToArray(); + + return 0; + } private 
static int Run(IsIdentifiableDicomFileOptions opts, IFileSystem fileSystem) {