From 6948d07405f54d21456a50d5ca7174e68e036f42 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Sun, 26 May 2019 12:46:00 -0400 Subject: [PATCH 01/22] Fix an off-by-one in the readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ba574db..b1382b8 100644 --- a/README.md +++ b/README.md @@ -69,7 +69,7 @@ public sealed class MyVisitor : CsvReaderVisitorBase private void VisitFieldContents(ReadOnlySpan chunk, bool flush) { int charCount = _utf8Decoder.GetCharCount(chunk, flush); - if (charCount + _bufferConsumed < _buffer.Length) + if (charCount + _bufferConsumed <= _buffer.Length) { _utf8Decoder.GetChars(chunk, new Span(_buffer, _bufferConsumed, charCount), flush); _bufferConsumed += charCount; From 18a757a7c737363abed94246c8016ee5bede4b7e Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Sun, 26 May 2019 13:56:36 -0400 Subject: [PATCH 02/22] Add memory-mapped file helper. Resolves #9 --- src/Cursively/Csv.cs | 64 +++++++++++++++++++++++ src/Cursively/Cursively.csproj | 1 + test/Cursively.Benchmark/Program.cs | 22 ++++---- test/Cursively.Tests/CsvTokenizerTests.cs | 24 ++++++++- 4 files changed, 96 insertions(+), 15 deletions(-) create mode 100644 src/Cursively/Csv.cs diff --git a/src/Cursively/Csv.cs b/src/Cursively/Csv.cs new file mode 100644 index 0000000..e23740b --- /dev/null +++ b/src/Cursively/Csv.cs @@ -0,0 +1,64 @@ +using System; +using System.IO; +using System.IO.MemoryMappedFiles; + +namespace Cursively +{ + /// + /// Contains helper methods for CSV processing. + /// + public static class Csv + { + /// + /// Describes the contents of a CSV file to the given instance of the + /// class, using memory-mapped files behind the scenes. + /// + /// + /// The path to the CSV file to describe. + /// + /// + /// The instance to describe the file to. + /// + public static unsafe void ProcessMemoryMappedFile(string csvFilePath, CsvReaderVisitorBase visitor) + { + using (var fl = new FileStream(csvFilePath, FileMode.Open, FileAccess.Read, FileShare.Read)) + { + long length = fl.Length; + if (length == 0) + { + return; + } + + var tokenizer = new CsvTokenizer(); + using (var memoryMappedFile = MemoryMappedFile.CreateFromFile(fl, null, 0, MemoryMappedFileAccess.Read, HandleInheritability.None, leaveOpen: true)) + using (var accessor = memoryMappedFile.CreateViewAccessor(0, 0, MemoryMappedFileAccess.Read)) + { + var handle = accessor.SafeMemoryMappedViewHandle; + byte* ptr = null; + try + { + handle.AcquirePointer(ref ptr); + for (long rem = length; rem > 0; rem -= int.MaxValue) + { + int currentChunkLength = rem < int.MaxValue + ? 
unchecked((int)rem) + : int.MaxValue; + + var span = new ReadOnlySpan(ptr, currentChunkLength); + tokenizer.ProcessNextChunk(span, visitor); + } + + tokenizer.ProcessEndOfStream(visitor); + } + finally + { + if (ptr != null) + { + handle.ReleasePointer(); + } + } + } + } + } + } +} diff --git a/src/Cursively/Cursively.csproj b/src/Cursively/Cursively.csproj index a702484..6f5c7ae 100644 --- a/src/Cursively/Cursively.csproj +++ b/src/Cursively/Cursively.csproj @@ -2,6 +2,7 @@ netstandard2.0 + true diff --git a/test/Cursively.Benchmark/Program.cs b/test/Cursively.Benchmark/Program.cs index 38d3d9c..30a987e 100644 --- a/test/Cursively.Benchmark/Program.cs +++ b/test/Cursively.Benchmark/Program.cs @@ -11,32 +11,28 @@ namespace Cursively.Benchmark { - [ClrJob] - [CoreJob] - [CoreRtJob] - [GcServer(true)] - [MemoryDiagnoser] + [ClrJob, CoreJob, GcServer(true), MemoryDiagnoser] public class Program { public static CsvFile[] CsvFiles => GetCsvFiles(); [Benchmark(Baseline = true)] [ArgumentsSource(nameof(CsvFiles))] - public void NopUsingCursively(CsvFile csvFile) + public long CountRowsUsingCursivelyByteArray(CsvFile csvFile) { + var visitor = new RowCountingVisitor(); var tokenizer = new CsvTokenizer(); - tokenizer.ProcessNextChunk(csvFile.FileData, null); - tokenizer.ProcessEndOfStream(null); + tokenizer.ProcessNextChunk(csvFile.FileData, visitor); + tokenizer.ProcessEndOfStream(visitor); + return visitor.RowCount; } [Benchmark] [ArgumentsSource(nameof(CsvFiles))] - public long CountRowsUsingCursively(CsvFile csvFile) + public long CountRowsUsingCursivelyWithMemoryMappedFile(CsvFile csvFile) { var visitor = new RowCountingVisitor(); - var tokenizer = new CsvTokenizer(); - tokenizer.ProcessNextChunk(csvFile.FileData, visitor); - tokenizer.ProcessEndOfStream(visitor); + Csv.ProcessMemoryMappedFile(csvFile.FullPath, visitor); return visitor.RowCount; } @@ -63,7 +59,7 @@ private static int Main() var prog = new Program(); foreach (var csvFile in CsvFiles) { - if (prog.CountRowsUsingCursively(csvFile) != prog.CountRowsUsingCsvHelper(csvFile)) + if (prog.CountRowsUsingCursivelyByteArray(csvFile) != prog.CountRowsUsingCsvHelper(csvFile)) { Console.Error.WriteLine($"Failed on {csvFile}."); return 1; diff --git a/test/Cursively.Tests/CsvTokenizerTests.cs b/test/Cursively.Tests/CsvTokenizerTests.cs index 0c18b40..d1d2849 100644 --- a/test/Cursively.Tests/CsvTokenizerTests.cs +++ b/test/Cursively.Tests/CsvTokenizerTests.cs @@ -18,13 +18,17 @@ public sealed class CsvTokenizerTests private static readonly int[] TestChunkLengths = { 1, 2, 3, 5, 8, 13, 21, 34 }; public static IEnumerable TestCsvFiles => + from filePath in Directory.EnumerateFiles(TestCsvFilesFolderPath, "*.csv") + select new object[] { filePath }; + + public static IEnumerable TestCsvFilesWithChunkLengths => from filePath in Directory.EnumerateFiles(TestCsvFilesFolderPath, "*.csv") let fileName = Path.GetFileNameWithoutExtension(filePath) from chunkLength in TestChunkLengths select new object[] { fileName, chunkLength }; [Theory] - [MemberData(nameof(TestCsvFiles))] + [MemberData(nameof(TestCsvFilesWithChunkLengths))] public void NullVisitorShouldBeFine(string fileName, int chunkLength) { // arrange @@ -46,7 +50,7 @@ public void NullVisitorShouldBeFine(string fileName, int chunkLength) } [Theory] - [MemberData(nameof(TestCsvFiles))] + [MemberData(nameof(TestCsvFilesWithChunkLengths))] public void CsvTokenizationShouldMatchCsvHelper(string fileName, int chunkLength) { // arrange @@ -64,6 +68,22 @@ public void 
CsvTokenizationShouldMatchCsvHelper(string fileName, int chunkLength } } + [Theory] + [MemberData(nameof(TestCsvFiles))] + public void MemoryMappedCsvShouldMatchCsvHelper(string filePath) + { + // arrange + var visitor = new StringBufferingVisitor(checked((int)new FileInfo(filePath).Length)); + + // act + Csv.ProcessMemoryMappedFile(filePath, visitor); + var actual = visitor.Lines; + + // assert + var expected = TokenizeCsvFileUsingCsvHelper(File.ReadAllBytes(filePath)); + Assert.Equal(expected, actual); + } + private static List TokenizeCsvFileUsingMine(ReadOnlySpan fileData, int chunkLength) { var tokenizer = new CsvTokenizer(); From a20b068d9c27457749170777e1a180832bf8ba61 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Mon, 27 May 2019 10:14:50 -0400 Subject: [PATCH 03/22] doc updates --- doc/benchmark-1.0.0.md | 71 ++++++++++++++++++++++++++++++++++++++++++ doc/toc.yml | 2 ++ 2 files changed, 73 insertions(+) create mode 100644 doc/benchmark-1.0.0.md diff --git a/doc/benchmark-1.0.0.md b/doc/benchmark-1.0.0.md new file mode 100644 index 0000000..6ba25b1 --- /dev/null +++ b/doc/benchmark-1.0.0.md @@ -0,0 +1,71 @@ +This benchmark tests the simple act of counting how many records are in a CSV file. It's not a simple count of how many lines are in the text file: line breaks within quoted fields must be treated as data, and multiple line breaks in a row must be treated as one, since each record must have at least one field. Therefore, assuming correct implementations, this benchmark should test the raw CSV processing speed. + +Cursively eliminates a ton of overhead found in libraries such as CsvHelper by restricting the allowed input encodings and using the visitor pattern as its only means of output. Cursively can scan through the original bytes of the input to do its work, and it can give slices of the input data directly to the consumer without having to copy or allocate. + +Therefore, these benchmarks are somewhat biased in favor of Cursively, as CsvHelper relies on external code to transform the data to UTF-16. This isn't as unfair as that makes it sound: the overwhelming majority of input files are probably UTF-8 anyway (or a compatible SBCS), so this transformation is something that practically every user will experience. + +- Input files can be found here: https://github.com/airbreather/Cursively/tree/v1.0.0/test/Cursively.Benchmark/large-csv-files +- Benchmark source code is a slightly edited* version of this: https://github.com/airbreather/Cursively/tree/v1.0.0/test/Cursively.Benchmark + - *edited only to remove `CoreRtJob` and the more-or-less redundant `NopUsingCursively` + +Raw BenchmarkDotNet output is at the bottom, but here are some numbers derived from it. The data was fully loaded in main memory when running these tests. 
This summary also does not indicate anything about the GC pressure: + +|CSV File|Runtime|Library|Throughput| +|-|-|-|-| +|100 records / 10,000 tiny fields each|.NET 4.7.2|Cursively|99.81 MiB/s| +|100 records / 10,000 tiny fields each|.NET 4.7.2|CsvHelper|22.60 MiB/s| +|100 records / 10,000 tiny fields each|.NET Core 2.2.5|Cursively|126.1 MiB/s| +|100 records / 10,000 tiny fields each|.NET Core 2.2.5|CsvHelper|25.32 MiB/s| +|100 records / 10,000 tiny quoted fields each|.NET 4.7.2|Cursively|118.5 MiB/s| +|100 records / 10,000 tiny quoted fields each|.NET 4.7.2|CsvHelper|25.05 MiB/s| +|100 records / 10,000 tiny quoted fields each|.NET Core 2.2.5|Cursively|187.0 MiB/s| +|100 records / 10,000 tiny quoted fields each|.NET Core 2.2.5|CsvHelper|27.96 MiB/s| +|10,000 records / 1,000 empty fields each|.NET 4.7.2|Cursively|64.15 MiB/s| +|10,000 records / 1,000 empty fields each|.NET 4.7.2|CsvHelper|15.57 MiB/s| +|10,000 records / 1,000 empty fields each|.NET Core 2.2.5|Cursively|112.7 MiB/s| +|10,000 records / 1,000 empty fields each|.NET Core 2.2.5|CsvHelper|14.84 MiB/s| +|Mock data from Mockaroo|.NET 4.7.2|Cursively|1.637 GiB/s| +|Mock data from Mockaroo|.NET 4.7.2|CsvHelper|74.81 MiB/s| +|Mock data from Mockaroo|.NET Core 2.2.5|Cursively|1.893 GiB/s| +|Mock data from Mockaroo|.NET Core 2.2.5|CsvHelper|66.86 MiB/s| + +Raw BenchmarkDotNet output: + +``` ini + +BenchmarkDotNet=v0.11.5, OS=Windows 10.0.17134.765 (1803/April2018Update/Redstone4) +Intel Core i7-6850K CPU 3.60GHz (Skylake), 1 CPU, 12 logical and 6 physical cores +Frequency=3515622 Hz, Resolution=284.4447 ns, Timer=TSC +.NET Core SDK=2.2.300 + [Host] : .NET Core 2.2.5 (CoreCLR 4.6.27617.05, CoreFX 4.6.27618.01), 64bit RyuJIT + Job-ASLTDW : .NET Framework 4.7.2 (CLR 4.0.30319.42000), 64bit RyuJIT-v4.7.3416.0 + Job-RICADF : .NET Core 2.2.5 (CoreCLR 4.6.27617.05, CoreFX 4.6.27618.01), 64bit RyuJIT + +Server=True + +``` +| Method | Runtime | csvFile | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated | +|------------------------ |-------- |--------------------- |-----------:|----------:|----------:|------:|--------:|-----------:|----------:|---------:|------------:| +| CountRowsUsingCursively | Clr | 100-huge-records | 27.714 ms | 0.0126 ms | 0.0105 ms | 1.00 | 0.00 | - | - | - | 256 B | +| CountRowsUsingCsvHelper | Clr | 100-huge-records | 122.397 ms | 0.1685 ms | 0.1494 ms | 4.42 | 0.01 | 17250.0000 | 6250.0000 | 750.0000 | 110257334 B | +| | | | | | | | | | | | | +| CountRowsUsingCursively | Core | 100-huge-records | 21.932 ms | 0.0254 ms | 0.0226 ms | 1.00 | 0.00 | - | - | - | 56 B | +| CountRowsUsingCsvHelper | Core | 100-huge-records | 109.261 ms | 0.3319 ms | 0.3104 ms | 4.98 | 0.02 | 400.0000 | 200.0000 | - | 110256320 B | +| | | | | | | | | | | | | +| CountRowsUsingCursively | Clr | 100-h(...)uoted [23] | 39.453 ms | 0.0974 ms | 0.0864 ms | 1.00 | 0.00 | - | - | - | 683 B | +| CountRowsUsingCsvHelper | Clr | 100-h(...)uoted [23] | 186.572 ms | 0.4682 ms | 0.4380 ms | 4.73 | 0.01 | 24666.6667 | 9666.6667 | 666.6667 | 153595995 B | +| | | | | | | | | | | | | +| CountRowsUsingCursively | Core | 100-h(...)uoted [23] | 24.995 ms | 0.0160 ms | 0.0142 ms | 1.00 | 0.00 | - | - | - | 56 B | +| CountRowsUsingCsvHelper | Core | 100-h(...)uoted [23] | 167.160 ms | 0.3437 ms | 0.3215 ms | 6.69 | 0.02 | 333.3333 | - | - | 153579848 B | +| | | | | | | | | | | | | +| CountRowsUsingCursively | Clr | 10k-empty-records | 148.952 ms | 0.2502 ms | 0.2340 ms | 1.00 | 0.00 | - | - | - | 2048 B | +| CountRowsUsingCsvHelper | Clr | 
10k-empty-records | 613.718 ms | 0.8869 ms | 0.7862 ms | 4.12 | 0.01 | 66000.0000 | 2000.0000 | - | 420838944 B | +| | | | | | | | | | | | | +| CountRowsUsingCursively | Core | 10k-empty-records | 84.801 ms | 0.1079 ms | 0.1009 ms | 1.00 | 0.00 | - | - | - | 56 B | +| CountRowsUsingCsvHelper | Core | 10k-empty-records | 644.051 ms | 2.8782 ms | 2.5515 ms | 7.60 | 0.03 | 2000.0000 | - | - | 420832856 B | +| | | | | | | | | | | | | +| CountRowsUsingCursively | Clr | mocked | 7.242 ms | 0.0233 ms | 0.0207 ms | 1.00 | 0.00 | - | - | - | 64 B | +| CountRowsUsingCsvHelper | Clr | mocked | 162.298 ms | 0.2958 ms | 0.2622 ms | 22.41 | 0.08 | 18000.0000 | 333.3333 | - | 115764389 B | +| | | | | | | | | | | | | +| CountRowsUsingCursively | Core | mocked | 6.264 ms | 0.0115 ms | 0.0107 ms | 1.00 | 0.00 | - | - | - | 56 B | +| CountRowsUsingCsvHelper | Core | mocked | 181.592 ms | 0.3413 ms | 0.3193 ms | 28.99 | 0.09 | 333.3333 | - | - | 115757736 B | diff --git a/doc/toc.yml b/doc/toc.yml index aa4ba7c..0e11d66 100644 --- a/doc/toc.yml +++ b/doc/toc.yml @@ -2,6 +2,8 @@ href: index.md - name: API Documentation href: obj/api/ +- name: Benchmark + href: benchmark-1.0.0.md - name: Release Notes href: release-notes.md - name: NuGet Package From 69d65d83aef64e666faccf734f2774735bbd016e Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Tue, 28 May 2019 09:44:01 -0400 Subject: [PATCH 04/22] Support custom delimiters. Resolves #11 --- src/Cursively/CsvTokenizer.cs | 215 +++++++++++++--------- test/Cursively.Benchmark/Program.cs | 4 +- test/Cursively.Tests/CsvTokenizerTests.cs | 56 +++--- 3 files changed, 159 insertions(+), 116 deletions(-) diff --git a/src/Cursively/CsvTokenizer.cs b/src/Cursively/CsvTokenizer.cs index 8dd039e..1a44af0 100644 --- a/src/Cursively/CsvTokenizer.cs +++ b/src/Cursively/CsvTokenizer.cs @@ -142,20 +142,50 @@ namespace Cursively /// public class CsvTokenizer { - private const byte COMMA = (byte)','; - private const byte CR = (byte)'\r'; private const byte LF = (byte)'\n'; private const byte QUOTE = (byte)'"'; - private static readonly byte[] AllStopBytes = { COMMA, QUOTE, CR, LF }; - - private static readonly byte[] AllStopBytesExceptQuote = { COMMA, CR, LF }; + private readonly byte _delimiter; private ParserFlags _parserFlags; + /// + /// Initializes a new instance of the class. + /// + public CsvTokenizer() + : this((byte)',') + { + } + + /// + /// Initializes a new instance of the class. + /// + /// + /// The single byte to expect to see between fields of the same record. This may not be an + /// end-of-line or double-quote character, as those have special meanings. + /// + /// + /// Thrown when is 0x0A, 0x0D, or + /// 0x22. + /// + public CsvTokenizer(byte delimiter) + { + switch (delimiter) + { + case CR: + case LF: + case QUOTE: + throw new ArgumentException("Must not be a carriage return, linefeed, or double-quote.", nameof(delimiter)); + + default: + _delimiter = delimiter; + break; + } + } + [Flags] private enum ParserFlags : byte { @@ -189,8 +219,7 @@ public void ProcessNextChunk(ReadOnlySpan chunk, CsvReaderVisitorBase visi visitor = CsvReaderVisitorBase.Null; } - // cache the implicit conversion for the sake of "portable span" targets. - ReadOnlySpan allStopBytes = AllStopBytes; + byte delimiter = _delimiter; // we're going to consume the entire buffer that was handed to us. 
while (!chunk.IsEmpty) @@ -204,17 +233,11 @@ public void ProcessNextChunk(ReadOnlySpan chunk, CsvReaderVisitorBase visi continue; } - int idx = chunk.IndexOfAny(allStopBytes); - if (idx < 0) - { - visitor.VisitPartialFieldContents(chunk); - _parserFlags = ParserFlags.ReadAnythingInCurrentField | ParserFlags.ReadAnythingOnCurrentLine; - break; - } - - switch (chunk[idx]) + for (int idx = 0; idx < chunk.Length; idx++) { - case QUOTE: + byte c = chunk[idx]; + if (c == QUOTE) + { if (idx == 0) { _parserFlags = ParserFlags.CurrentFieldStartedWithQuote | ParserFlags.ReadAnythingInCurrentField | ParserFlags.ReadAnythingOnCurrentLine; @@ -227,20 +250,30 @@ public void ProcessNextChunk(ReadOnlySpan chunk, CsvReaderVisitorBase visi visitor.VisitPartialFieldContents(chunk.Slice(0, idx + 1)); _parserFlags = ParserFlags.ReadAnythingInCurrentField | ParserFlags.ReadAnythingOnCurrentLine; } - - break; - - case COMMA: + } + else if (c == delimiter) + { visitor.VisitEndOfField(chunk.Slice(0, idx)); _parserFlags = ParserFlags.ReadAnythingOnCurrentLine; - break; - - default: + } + else if (c == CR || c == LF) + { ProcessEndOfLine(chunk.Slice(0, idx), visitor); - break; + } + else + { + continue; + } + + chunk = chunk.Slice(idx + 1); + goto nextLoop; } - chunk = chunk.Slice(idx + 1); + visitor.VisitPartialFieldContents(chunk); + _parserFlags = ParserFlags.ReadAnythingInCurrentField | ParserFlags.ReadAnythingOnCurrentLine; + break; + + nextLoop:; } } @@ -303,69 +336,69 @@ private void PickUpFromLastTime(ref ReadOnlySpan readBuffer, CsvReaderVisi } // we have at least one more byte, so let's see what the double quote actually means - switch (readBuffer[idx + 1]) + byte b = readBuffer[idx + 1]; + if (b == QUOTE) { - case QUOTE: - // the double quote we stopped at was escaping a literal double quote, so we - // send everything up to and including the escaping quote. - visitor.VisitPartialFieldContents(readBuffer.Slice(0, idx + 1)); - break; - - case COMMA: - // the double quote was the end of a quoted field, so send the entire data - // from the beginning of this quoted field data chunk up to the double quote - // that terminated it (excluding, of course, the double quote itself). - visitor.VisitEndOfField(readBuffer.Slice(0, idx)); - _parserFlags = ParserFlags.ReadAnythingOnCurrentLine; - break; - - case CR: - case LF: - // same thing as the COMMA case, just the field ended at the end of a line - // instead of the end of a field on the current line. - ProcessEndOfLine(readBuffer.Slice(0, idx), visitor); - break; - - default: - // the double quote was the end of the quoted part of the field data, but - // then it continues on with more data; don't spend too much time optimizing - // this case since it's not RFC 4180, just do the parts we need to do in - // order to behave the way we said we would. - _parserFlags |= ParserFlags.QuotedFieldDataEnded; - visitor.VisitPartialFieldContents(readBuffer.Slice(0, idx)); - visitor.VisitPartialFieldContents(readBuffer.Slice(idx + 1, 1)); - break; + // the double quote we stopped at was escaping a literal double quote, so we + // send everything up to and including the escaping quote. + visitor.VisitPartialFieldContents(readBuffer.Slice(0, idx + 1)); + } + else if (b == _delimiter) + { + // the double quote was the end of a quoted field, so send the entire data from + // the beginning of this quoted field data chunk up to the double quote that + // terminated it (excluding, of course, the double quote itself). 
+ visitor.VisitEndOfField(readBuffer.Slice(0, idx)); + _parserFlags = ParserFlags.ReadAnythingOnCurrentLine; + } + else if (b == CR || b == LF) + { + // same thing as the delimiter case, just the field ended at the end of a line + // instead of the end of a field on the current line. + ProcessEndOfLine(readBuffer.Slice(0, idx), visitor); + } + else + { + // the double quote was the end of the quoted part of the field data, but then + // it continues on with more data; don't spend too much time optimizing this + // case since it's not RFC 4180, just do the parts we need to do in order to + // behave the way we said we would. + _parserFlags |= ParserFlags.QuotedFieldDataEnded; + visitor.VisitPartialFieldContents(readBuffer.Slice(0, idx)); + visitor.VisitPartialFieldContents(readBuffer.Slice(idx + 1, 1)); } // slice off the data up to the quote and the next byte that we read. readBuffer = readBuffer.Slice(idx + 2); - return; } - - // this is expected to be rare: either we were cut between field reads, or we're reading - // nonstandard field data where there's a quote that neither starts nor ends the field. + else { - int idx = readBuffer.IndexOfAny(AllStopBytesExceptQuote); - if (idx < 0) + // this is expected to be rare: either we were cut between field reads, or we're + // reading nonstandard field data where there's a quote that neither starts nor ends + // the field. + for (int idx = 0; idx < readBuffer.Length; idx++) { - visitor.VisitPartialFieldContents(readBuffer); - readBuffer = default; - return; - } - - switch (readBuffer[idx]) - { - case COMMA: + byte b = readBuffer[idx]; + if (b == _delimiter) + { visitor.VisitEndOfField(readBuffer.Slice(0, idx)); _parserFlags = ParserFlags.ReadAnythingOnCurrentLine; - break; - - default: + } + else if (b == CR || b == LF) + { ProcessEndOfLine(readBuffer.Slice(0, idx), visitor); - break; + } + else + { + continue; + } + + readBuffer = readBuffer.Slice(idx + 1); + return; } - readBuffer = readBuffer.Slice(idx + 1); + visitor.VisitPartialFieldContents(readBuffer); + readBuffer = default; } } @@ -379,24 +412,22 @@ private void HandleBufferCutAtPotentiallyTerminalDoubleQuote(ref ReadOnlySpan TestCsvFiles => from filePath in Directory.EnumerateFiles(TestCsvFilesFolderPath, "*.csv") select new object[] { filePath }; - public static IEnumerable TestCsvFilesWithChunkLengths => + public static IEnumerable TestCsvFilesWithChunkLengthsAndDelimiters => from filePath in Directory.EnumerateFiles(TestCsvFilesFolderPath, "*.csv") let fileName = Path.GetFileNameWithoutExtension(filePath) from chunkLength in TestChunkLengths - select new object[] { fileName, chunkLength }; + from delimiter in TestDelimiters + select new object[] { fileName, chunkLength, delimiter }; + + [Theory] + [InlineData((byte)0x0A)] + [InlineData((byte)0x0D)] + [InlineData((byte)0x22)] + public void ConstructorShouldRejectInvalidDelimiters(byte delimiter) + { + Assert.Throws("delimiter", () => new CsvTokenizer(delimiter)); + } [Theory] - [MemberData(nameof(TestCsvFilesWithChunkLengths))] - public void NullVisitorShouldBeFine(string fileName, int chunkLength) + [MemberData(nameof(TestCsvFiles))] + public void NullVisitorShouldBeFine(string filePath) { // arrange - string fullCsvFilePath = Path.Combine(TestCsvFilesFolderPath, fileName + ".csv"); - ReadOnlySpan fileData = File.ReadAllBytes(fullCsvFilePath); + ReadOnlySpan fileData = File.ReadAllBytes(filePath); var tokenizer = new CsvTokenizer(); // act - while (fileData.Length >= chunkLength) - { - tokenizer.ProcessNextChunk(fileData.Slice(0, 
chunkLength), null); - fileData = fileData.Slice(chunkLength); - } - tokenizer.ProcessNextChunk(fileData, null); tokenizer.ProcessEndOfStream(null); @@ -50,20 +55,27 @@ public void NullVisitorShouldBeFine(string fileName, int chunkLength) } [Theory] - [MemberData(nameof(TestCsvFilesWithChunkLengths))] - public void CsvTokenizationShouldMatchCsvHelper(string fileName, int chunkLength) + [MemberData(nameof(TestCsvFilesWithChunkLengthsAndDelimiters))] + public void CsvTokenizationShouldMatchCsvHelper(string fileName, int chunkLength, byte delimiter) { // arrange byte[] fileDataTemplate = File.ReadAllBytes(Path.Combine(TestCsvFilesFolderPath, fileName + ".csv")); + for (int i = 0; i < fileDataTemplate.Length; i++) + { + if (fileDataTemplate[i] == (byte)',') + { + fileDataTemplate[i] = delimiter; + } + } - int randomSeed = HashCode.Combine(fileName, chunkLength); + int randomSeed = HashCode.Combine(fileName, chunkLength, delimiter); foreach (byte[] fileData in VaryLineEndings(fileDataTemplate, randomSeed)) { // act - var actual = TokenizeCsvFileUsingMine(fileData, chunkLength); + var actual = TokenizeCsvFileUsingCursively(fileData, chunkLength, delimiter); // assert - var expected = TokenizeCsvFileUsingCsvHelper(fileData); + var expected = TokenizeCsvFileUsingCsvHelper(fileData, $"{(char)delimiter}"); Assert.Equal(expected, actual); } } @@ -80,13 +92,13 @@ public void MemoryMappedCsvShouldMatchCsvHelper(string filePath) var actual = visitor.Lines; // assert - var expected = TokenizeCsvFileUsingCsvHelper(File.ReadAllBytes(filePath)); + var expected = TokenizeCsvFileUsingCsvHelper(File.ReadAllBytes(filePath), ","); Assert.Equal(expected, actual); } - private static List TokenizeCsvFileUsingMine(ReadOnlySpan fileData, int chunkLength) + private static List TokenizeCsvFileUsingCursively(ReadOnlySpan fileData, int chunkLength, byte delimiter) { - var tokenizer = new CsvTokenizer(); + var tokenizer = new CsvTokenizer(delimiter); var visitor = new StringBufferingVisitor(fileData.Length); while (fileData.Length >= chunkLength) { @@ -99,11 +111,11 @@ private static List TokenizeCsvFileUsingMine(ReadOnlySpan fileDa return visitor.Lines; } - private static IEnumerable TokenizeCsvFileUsingCsvHelper(byte[] csvData) + private static IEnumerable TokenizeCsvFileUsingCsvHelper(byte[] csvData, string delimiter) { using (var stream = new MemoryStream(csvData, false)) using (var streamReader = new StreamReader(stream, new UTF8Encoding(false, false), false)) - using (var csvReader = new CsvReader(streamReader, new Configuration { BadDataFound = null })) + using (var csvReader = new CsvReader(streamReader, new Configuration { BadDataFound = null, Delimiter = delimiter })) { while (csvReader.Read()) { From b3be94f654beb2b03049bdb2bfd7e417453c4a72 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Wed, 5 Jun 2019 10:13:09 -0400 Subject: [PATCH 05/22] Try setting up LGTM --- Directory.Build.props | 2 +- lgtm.yml | 4 ++++ src/Directory.Build.props | 5 ++++- 3 files changed, 9 insertions(+), 2 deletions(-) create mode 100644 lgtm.yml diff --git a/Directory.Build.props b/Directory.Build.props index b6bbf76..491987b 100644 --- a/Directory.Build.props +++ b/Directory.Build.props @@ -8,10 +8,10 @@ true 7.3 - true + true true diff --git a/lgtm.yml b/lgtm.yml new file mode 100644 index 0000000..1052cbe --- /dev/null +++ b/lgtm.yml @@ -0,0 +1,4 @@ +extraction: + csharp: + after_prepare: + - export LGTM=true diff --git a/src/Directory.Build.props b/src/Directory.Build.props index a83c324..4b420f8 100644 --- 
a/src/Directory.Build.props +++ b/src/Directory.Build.props @@ -21,8 +21,11 @@ false - + + + + From c2e3f78fb32bbd6a815ada10ab15062dde5a8253 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Wed, 5 Jun 2019 10:22:25 -0400 Subject: [PATCH 06/22] One more try to get LGTM to work. --- Directory.Build.props | 2 +- lgtm.yml | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Directory.Build.props b/Directory.Build.props index 491987b..b10b8ad 100644 --- a/Directory.Build.props +++ b/Directory.Build.props @@ -15,7 +15,7 @@ true - + diff --git a/lgtm.yml b/lgtm.yml index 1052cbe..0c3c07c 100644 --- a/lgtm.yml +++ b/lgtm.yml @@ -2,3 +2,5 @@ extraction: csharp: after_prepare: - export LGTM=true + index: + solution: src/Cursively/Cursively.csproj From c973ecc27ed8985a5d9354d695e2b4eda58fdd2f Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Thu, 6 Jun 2019 08:59:42 -0400 Subject: [PATCH 07/22] Add worldcitiespop.csv for benchmarking. Apparently, this is a more-or-less standard CSV processing benchmark file. --- README.md | 10 +-- test/Cursively.Benchmark/Program.cs | 83 +++++++++++++++++-- .../large-csv-files/worldcitiespop.csv | 3 + 3 files changed, 84 insertions(+), 12 deletions(-) create mode 100644 test/Cursively.Benchmark/large-csv-files/worldcitiespop.csv diff --git a/README.md b/README.md index b1382b8..7b8411d 100644 --- a/README.md +++ b/README.md @@ -79,14 +79,12 @@ public sealed class MyVisitor : CsvReaderVisitorBase throw new InvalidDataException($"Field is longer than {_buffer.Length} characters."); } - if (!flush) + if (flush) { - return; + Console.Write("Field: "); + Console.WriteLine(_buffer, 0, _bufferConsumed); + _bufferConsumed = 0; } - - Console.Write("Field: "); - Console.WriteLine(_buffer, 0, _bufferConsumed); - _bufferConsumed = 0; } } ``` diff --git a/test/Cursively.Benchmark/Program.cs b/test/Cursively.Benchmark/Program.cs index 7279bf2..b45fcd6 100644 --- a/test/Cursively.Benchmark/Program.cs +++ b/test/Cursively.Benchmark/Program.cs @@ -11,9 +11,80 @@ namespace Cursively.Benchmark { - [ClrJob, CoreJob, GcServer(true), MemoryDiagnoser] + [ClrJob] + [CoreJob] + [GcServer(true)] + [MemoryDiagnoser] public class Program { + public static void ProcessCsvFile(string csvFilePath) + { + var myVisitor = new MyVisitor(maxFieldLength: 1000); + var tokenizer = new CsvTokenizer(); + using (var file = File.OpenRead(csvFilePath)) + { + Console.WriteLine($"Started reading '{csvFilePath}'."); + Span fileReadBuffer = new byte[4096]; + while (true) + { + int count = file.Read(fileReadBuffer); + if (count == 0) + { + break; + } + + var chunk = fileReadBuffer.Slice(0, count); + tokenizer.ProcessNextChunk(chunk, myVisitor); + } + + tokenizer.ProcessEndOfStream(myVisitor); + } + + Console.WriteLine($"Finished reading '{csvFilePath}'."); + } + + public sealed class MyVisitor : CsvReaderVisitorBase + { + private readonly Decoder _utf8Decoder = Encoding.UTF8.GetDecoder(); + + private readonly char[] _buffer; + + private int _bufferConsumed; + + public MyVisitor(int maxFieldLength) => + _buffer = new char[maxFieldLength]; + + public override void VisitPartialFieldContents(ReadOnlySpan chunk) => + VisitFieldContents(chunk, flush: false); + + public override void VisitEndOfField(ReadOnlySpan chunk) => + VisitFieldContents(chunk, flush: true); + + public override void VisitEndOfRecord() => + Console.WriteLine("End of fields for this record."); + + private void VisitFieldContents(ReadOnlySpan chunk, bool flush) + { + int charCount = _utf8Decoder.GetCharCount(chunk, flush); + if (charCount + 
_bufferConsumed <= _buffer.Length) + { + _utf8Decoder.GetChars(chunk, new Span(_buffer, _bufferConsumed, charCount), flush); + _bufferConsumed += charCount; + } + else + { + throw new InvalidDataException($"Field is longer than {_buffer.Length} characters."); + } + + if (flush) + { + Console.Write("Field: "); + Console.WriteLine(_buffer, 0, _bufferConsumed); + _bufferConsumed = 0; + } + } + } + public static CsvFile[] CsvFiles => GetCsvFiles(); [Benchmark(Baseline = true)] @@ -27,7 +98,7 @@ public long CountRowsUsingCursivelyByteArray(CsvFile csvFile) return visitor.RowCount; } - ////[Benchmark] + [Benchmark] [ArgumentsSource(nameof(CsvFiles))] public long CountRowsUsingCursivelyWithMemoryMappedFile(CsvFile csvFile) { @@ -36,7 +107,7 @@ public long CountRowsUsingCursivelyWithMemoryMappedFile(CsvFile csvFile) return visitor.RowCount; } - ////[Benchmark] + [Benchmark] [ArgumentsSource(nameof(CsvFiles))] public long CountRowsUsingCsvHelper(CsvFile csvFile) { @@ -59,7 +130,9 @@ private static int Main() var prog = new Program(); foreach (var csvFile in CsvFiles) { - if (prog.CountRowsUsingCursivelyByteArray(csvFile) != prog.CountRowsUsingCsvHelper(csvFile)) + long rowCount = prog.CountRowsUsingCursivelyByteArray(csvFile); + if (prog.CountRowsUsingCsvHelper(csvFile) != rowCount || + prog.CountRowsUsingCursivelyWithMemoryMappedFile(csvFile) != rowCount) { Console.Error.WriteLine($"Failed on {csvFile}."); return 1; @@ -90,8 +163,6 @@ private static CsvFile[] GetCsvFiles([CallerFilePath]string myLocation = null) = private sealed class RowCountingVisitor : CsvReaderVisitorBase { - public long CharCount { get; private set; } - public long RowCount { get; private set; } public override void VisitEndOfRecord() => ++RowCount; diff --git a/test/Cursively.Benchmark/large-csv-files/worldcitiespop.csv b/test/Cursively.Benchmark/large-csv-files/worldcitiespop.csv new file mode 100644 index 0000000..01dfbfe --- /dev/null +++ b/test/Cursively.Benchmark/large-csv-files/worldcitiespop.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4c7824338bbbc228f5d3e0089c57233136e83853821252ade7ed556b4bcfc1b +size 151492068 From 0d4a907f26b1d292513ceaf7b5399ef347bcb97a Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Thu, 6 Jun 2019 09:04:15 -0400 Subject: [PATCH 08/22] Revert the sample code accidentally added here. 
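The sample being reverted here (added in the previous patch) demonstrated the library's core consumption pattern: push arbitrary byte chunks through a `CsvTokenizer`, then signal end of stream. Since that sample now disappears from the tree, here is a condensed, self-contained sketch of the same pattern for reference. The visitor mirrors the benchmark's own `RowCountingVisitor`; the file path parameter and the 4096-byte buffer are illustrative choices, not part of this patch.

```csharp
using System;
using System.IO;
using Cursively;

// Mirrors the RowCountingVisitor used by the benchmarks: counts records,
// ignores field contents.
internal sealed class RowCountingVisitor : CsvReaderVisitorBase
{
    public long RowCount { get; private set; }

    public override void VisitEndOfRecord() => ++RowCount;

    public override void VisitEndOfField(ReadOnlySpan<byte> chunk) { }

    public override void VisitPartialFieldContents(ReadOnlySpan<byte> chunk) { }
}

internal static class ChunkedReadDemo
{
    public static long CountRows(string csvFilePath)
    {
        var visitor = new RowCountingVisitor();
        var tokenizer = new CsvTokenizer();
        using (var file = File.OpenRead(csvFilePath))
        {
            byte[] buffer = new byte[4096];
            int count;
            while ((count = file.Read(buffer, 0, buffer.Length)) != 0)
            {
                // Chunk boundaries may fall anywhere, even mid-field or
                // mid-quote; the tokenizer carries state between calls.
                tokenizer.ProcessNextChunk(new ReadOnlySpan<byte>(buffer, 0, count), visitor);
            }
        }

        // Flush whatever the final chunk left pending (e.g., a last record
        // with no trailing newline).
        tokenizer.ProcessEndOfStream(visitor);
        return visitor.RowCount;
    }
}
```

A later patch in this series adds `Csv.ProcessStream`, which wraps essentially this same read loop behind a single helper call.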
--- test/Cursively.Benchmark/Program.cs | 68 ----------------------------- 1 file changed, 68 deletions(-) diff --git a/test/Cursively.Benchmark/Program.cs b/test/Cursively.Benchmark/Program.cs index b45fcd6..ab36394 100644 --- a/test/Cursively.Benchmark/Program.cs +++ b/test/Cursively.Benchmark/Program.cs @@ -17,74 +17,6 @@ namespace Cursively.Benchmark [MemoryDiagnoser] public class Program { - public static void ProcessCsvFile(string csvFilePath) - { - var myVisitor = new MyVisitor(maxFieldLength: 1000); - var tokenizer = new CsvTokenizer(); - using (var file = File.OpenRead(csvFilePath)) - { - Console.WriteLine($"Started reading '{csvFilePath}'."); - Span fileReadBuffer = new byte[4096]; - while (true) - { - int count = file.Read(fileReadBuffer); - if (count == 0) - { - break; - } - - var chunk = fileReadBuffer.Slice(0, count); - tokenizer.ProcessNextChunk(chunk, myVisitor); - } - - tokenizer.ProcessEndOfStream(myVisitor); - } - - Console.WriteLine($"Finished reading '{csvFilePath}'."); - } - - public sealed class MyVisitor : CsvReaderVisitorBase - { - private readonly Decoder _utf8Decoder = Encoding.UTF8.GetDecoder(); - - private readonly char[] _buffer; - - private int _bufferConsumed; - - public MyVisitor(int maxFieldLength) => - _buffer = new char[maxFieldLength]; - - public override void VisitPartialFieldContents(ReadOnlySpan chunk) => - VisitFieldContents(chunk, flush: false); - - public override void VisitEndOfField(ReadOnlySpan chunk) => - VisitFieldContents(chunk, flush: true); - - public override void VisitEndOfRecord() => - Console.WriteLine("End of fields for this record."); - - private void VisitFieldContents(ReadOnlySpan chunk, bool flush) - { - int charCount = _utf8Decoder.GetCharCount(chunk, flush); - if (charCount + _bufferConsumed <= _buffer.Length) - { - _utf8Decoder.GetChars(chunk, new Span(_buffer, _bufferConsumed, charCount), flush); - _bufferConsumed += charCount; - } - else - { - throw new InvalidDataException($"Field is longer than {_buffer.Length} characters."); - } - - if (flush) - { - Console.Write("Field: "); - Console.WriteLine(_buffer, 0, _bufferConsumed); - _bufferConsumed = 0; - } - } - } - public static CsvFile[] CsvFiles => GetCsvFiles(); [Benchmark(Baseline = true)] From 2a310f1b5d1111afb96992cb9e7895c958648dee Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Thu, 13 Jun 2019 08:10:39 -0400 Subject: [PATCH 09/22] Add helpers for processing a Stream more easily. Resolves #10 --- src/Cursively/Csv.cs | 277 +++++++++++++++++++++- src/Cursively/Cursively.csproj | 1 + test/Cursively.Benchmark/Program.cs | 12 +- test/Cursively.Tests/CsvTokenizerTests.cs | 2 +- 4 files changed, 269 insertions(+), 23 deletions(-) diff --git a/src/Cursively/Csv.cs b/src/Cursively/Csv.cs index e23740b..917992a 100644 --- a/src/Cursively/Csv.cs +++ b/src/Cursively/Csv.cs @@ -1,6 +1,9 @@ using System; using System.IO; using System.IO.MemoryMappedFiles; +using System.Runtime.CompilerServices; +using System.Threading; +using System.Threading.Tasks; namespace Cursively { @@ -10,8 +13,224 @@ namespace Cursively public static class Csv { /// - /// Describes the contents of a CSV file to the given instance of the - /// class, using memory-mapped files behind the scenes. + /// Describes the contents of a CSV stream to the given instance of the + /// class. + /// + /// + /// The CSV stream to describe. + /// + /// + /// The instance to describe the stream to. + /// + /// + /// Thrown when is . 
+ /// + public static void ProcessStream(Stream csvStream, CsvReaderVisitorBase visitor) => + ProcessStream(csvStream, visitor, 81920); + + /// + /// Describes the contents of a CSV stream to the given instance of the + /// class. + /// + /// + /// The CSV stream to describe. + /// + /// + /// The instance to describe the stream to. + /// + /// + /// The length of the buffer to use (default: 81920). + /// + /// + /// Thrown when is . + /// + /// + /// Thrown when is not greater than zero. + /// + /// + /// Thrown when does not support reading (i.e., + /// is ). + /// + public static void ProcessStream(Stream csvStream, CsvReaderVisitorBase visitor, int bufferSize) + { + if (csvStream is null) + { + throw new ArgumentNullException(nameof(csvStream)); + } + + if (bufferSize <= 0) + { + throw new ArgumentOutOfRangeException(nameof(bufferSize), bufferSize, "Must be greater than zero."); + } + + if (!csvStream.CanRead) + { + throw new ArgumentException("Stream does not support reading.", nameof(csvStream)); + } + + byte[] buffer = new byte[bufferSize]; + var tokenizer = new CsvTokenizer(); + int cnt; + while ((cnt = csvStream.Read(buffer, 0, buffer.Length)) != 0) + { + tokenizer.ProcessNextChunk(new ReadOnlySpan(buffer, 0, cnt), visitor); + } + + tokenizer.ProcessEndOfStream(visitor); + } + + /// + /// Describes the contents of a CSV stream to the given instance of the + /// class. + /// + /// + /// The CSV stream to describe. + /// + /// + /// The instance to describe the stream to. + /// + /// + /// + /// An that will be notified every time the next chunk of the + /// stream is processed, with the size of the chunk (in bytes) that was processed. + /// + /// + /// All notifications will receive values less than or equal to the buffer size in bytes + /// (which, for this overload, is the default value of 81,920). + /// + /// + /// There will be one last notification with value 0 after the entire stream has been + /// processed and the final few stream elements have been consumed. + /// + /// + /// This may be left as if no progress notifications are needed. + /// + /// + /// + /// + /// An instance of that may be used to signal that results + /// are no longer needed, and so the method should terminate at its earliest convenience. + /// + /// + /// This may be left as its default value of if the + /// operation does not need to support cancellation. + /// + /// + /// + /// Thrown when is . + /// + /// + /// Thrown when does not support reading (i.e., + /// is ). + /// + /// + /// Thrown (perhaps asynchronously) to acknowledge cancellation. A derived exception, such + /// as , may also be thrown by the system. + /// + /// + /// Thrown (perhaps asynchronously) if the underlying + /// object backing is disposed before the asynchronous + /// operation terminates. + /// + public static ValueTask ProcessStreamAsync(Stream csvStream, CsvReaderVisitorBase visitor, IProgress progress = null, CancellationToken cancellationToken = default) => + ProcessStreamAsync(csvStream, visitor, 81920, progress, cancellationToken); + + /// + /// Describes the contents of a CSV stream to the given instance of the + /// class. + /// + /// + /// The CSV stream to describe. + /// + /// + /// The instance to describe the stream to. + /// + /// + /// The length of the buffer to use (default: 81920). + /// + /// + /// + /// An that will be notified every time the next chunk of the + /// stream is processed, with the size of the chunk (in bytes) that was processed. 
+ /// + /// + /// All notifications will receive values less than or equal to the buffer size in bytes + /// (which, for this overload, is the value of ). + /// + /// + /// There will be one last notification with value 0 after the entire stream has been + /// processed and the final few stream elements have been consumed. + /// + /// + /// This may be left as if no progress notifications are needed. + /// + /// + /// + /// + /// An instance of that may be used to signal that results + /// are no longer needed, and so the method should terminate at its earliest convenience. + /// + /// + /// This may be left as its default value of if the + /// operation does not need to support cancellation. + /// + /// + /// + /// Thrown when is . + /// + /// + /// Thrown when is not greater than zero. + /// + /// + /// Thrown when does not support reading (i.e., + /// is ). + /// + /// + /// Thrown (perhaps asynchronously) to acknowledge cancellation. A derived exception, such + /// as , may also be thrown by the system. + /// + /// + /// Thrown (perhaps asynchronously) if the underlying + /// object backing is disposed before the asynchronous + /// operation terminates. + /// + public static async ValueTask ProcessStreamAsync(Stream csvStream, CsvReaderVisitorBase visitor, int bufferSize, IProgress progress = null, CancellationToken cancellationToken = default) + { + if (csvStream is null) + { + throw new ArgumentNullException(nameof(csvStream)); + } + + if (bufferSize <= 0) + { + throw new ArgumentOutOfRangeException(nameof(bufferSize), bufferSize, "Must be greater than zero."); + } + + if (!csvStream.CanRead) + { + throw new ArgumentException("Stream does not support reading.", nameof(csvStream)); + } + + byte[] buffer = new byte[bufferSize]; + var tokenizer = new CsvTokenizer(); + int cnt; + while ((cnt = await csvStream.ReadAsync(buffer, 0, buffer.Length, cancellationToken).ConfigureAwait(false)) != 0) + { + tokenizer.ProcessNextChunk(new ReadOnlySpan(buffer, 0, cnt), visitor); + progress?.Report(cnt); + + // not all streams support cancellation, so we might as well do this ourselves. it + // does involve a volatile read, so don't go overboard. + cancellationToken.ThrowIfCancellationRequested(); + } + + tokenizer.ProcessEndOfStream(visitor); + progress?.Report(0); + } + + /// + /// Describes the entire contents of a CSV file to the given instance of the + /// class. /// /// /// The path to the CSV file to describe. @@ -19,9 +238,46 @@ public static class Csv /// /// The instance to describe the file to. /// - public static unsafe void ProcessMemoryMappedFile(string csvFilePath, CsvReaderVisitorBase visitor) + /// + /// The current version of this method uses memory-mapping behind the scenes in order to + /// minimize the overhead of copying and cutting across discrete buffers, at the expense of + /// slightly more overhead to set up the memory map than a typical read-from-stream pattern. + /// + /// + /// See . + /// + /// + /// See . + /// + /// + /// See . + /// + /// + /// See . + /// + /// + /// + /// See . + /// + /// + /// See . + /// + /// + /// + /// See . + /// + /// + /// See . + /// + /// + /// See . + /// + /// + /// See . 
+ /// + public static unsafe void ProcessEntireFile(string csvFilePath, CsvReaderVisitorBase visitor) { - using (var fl = new FileStream(csvFilePath, FileMode.Open, FileAccess.Read, FileShare.Read)) + using (var fl = new FileStream(csvFilePath, FileMode.Open, FileAccess.Read, FileShare.Read, 4096, FileOptions.SequentialScan)) { long length = fl.Length; if (length == 0) @@ -35,19 +291,18 @@ public static unsafe void ProcessMemoryMappedFile(string csvFilePath, CsvReaderV { var handle = accessor.SafeMemoryMappedViewHandle; byte* ptr = null; + RuntimeHelpers.PrepareConstrainedRegions(); try { handle.AcquirePointer(ref ptr); - for (long rem = length; rem > 0; rem -= int.MaxValue) + while (length > int.MaxValue) { - int currentChunkLength = rem < int.MaxValue - ? unchecked((int)rem) - : int.MaxValue; - - var span = new ReadOnlySpan(ptr, currentChunkLength); - tokenizer.ProcessNextChunk(span, visitor); + tokenizer.ProcessNextChunk(new ReadOnlySpan(ptr, int.MaxValue), visitor); + length -= int.MaxValue; + ptr += int.MaxValue; } + tokenizer.ProcessNextChunk(new ReadOnlySpan(ptr, unchecked((int)length)), visitor); tokenizer.ProcessEndOfStream(visitor); } finally diff --git a/src/Cursively/Cursively.csproj b/src/Cursively/Cursively.csproj index 6f5c7ae..06e06e7 100644 --- a/src/Cursively/Cursively.csproj +++ b/src/Cursively/Cursively.csproj @@ -17,6 +17,7 @@ + diff --git a/test/Cursively.Benchmark/Program.cs b/test/Cursively.Benchmark/Program.cs index ab36394..943359a 100644 --- a/test/Cursively.Benchmark/Program.cs +++ b/test/Cursively.Benchmark/Program.cs @@ -30,15 +30,6 @@ public long CountRowsUsingCursivelyByteArray(CsvFile csvFile) return visitor.RowCount; } - [Benchmark] - [ArgumentsSource(nameof(CsvFiles))] - public long CountRowsUsingCursivelyWithMemoryMappedFile(CsvFile csvFile) - { - var visitor = new RowCountingVisitor(); - Csv.ProcessMemoryMappedFile(csvFile.FullPath, visitor); - return visitor.RowCount; - } - [Benchmark] [ArgumentsSource(nameof(CsvFiles))] public long CountRowsUsingCsvHelper(CsvFile csvFile) @@ -63,8 +54,7 @@ private static int Main() foreach (var csvFile in CsvFiles) { long rowCount = prog.CountRowsUsingCursivelyByteArray(csvFile); - if (prog.CountRowsUsingCsvHelper(csvFile) != rowCount || - prog.CountRowsUsingCursivelyWithMemoryMappedFile(csvFile) != rowCount) + if (prog.CountRowsUsingCsvHelper(csvFile) != rowCount) { Console.Error.WriteLine($"Failed on {csvFile}."); return 1; diff --git a/test/Cursively.Tests/CsvTokenizerTests.cs b/test/Cursively.Tests/CsvTokenizerTests.cs index 615c624..76a5717 100644 --- a/test/Cursively.Tests/CsvTokenizerTests.cs +++ b/test/Cursively.Tests/CsvTokenizerTests.cs @@ -88,7 +88,7 @@ public void MemoryMappedCsvShouldMatchCsvHelper(string filePath) var visitor = new StringBufferingVisitor(checked((int)new FileInfo(filePath).Length)); // act - Csv.ProcessMemoryMappedFile(filePath, visitor); + Csv.ProcessEntireFile(filePath, visitor); var actual = visitor.Lines; // assert From 8d364ab82a80fc800f07aa17b65c08faf09d8a0b Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Thu, 13 Jun 2019 09:45:39 -0400 Subject: [PATCH 10/22] Always set flags before invoking the visitor --- src/Cursively/CsvTokenizer.cs | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/src/Cursively/CsvTokenizer.cs b/src/Cursively/CsvTokenizer.cs index 1a44af0..b8fdaf6 100644 --- a/src/Cursively/CsvTokenizer.cs +++ b/src/Cursively/CsvTokenizer.cs @@ -233,6 +233,10 @@ public void ProcessNextChunk(ReadOnlySpan chunk, 
CsvReaderVisitorBase visi continue; } + // loop one-by-one, instead of doing an IndexOfAny, greedily assuming that the most + // performance-sensitive applications will tend to have few enough bytes in each + // unquoted field that this manual inlining will benefit those applications **much** + // more than practically any IndexOfAny implementation would. for (int idx = 0; idx < chunk.Length; idx++) { byte c = chunk[idx]; @@ -247,14 +251,14 @@ public void ProcessNextChunk(ReadOnlySpan chunk, CsvReaderVisitorBase visi // RFC 4180 forbids quotes that show up anywhere but the beginning of a // field, so it's up to us to decide what we want to do about this. We // choose to treat all such quotes as just regular data. - visitor.VisitPartialFieldContents(chunk.Slice(0, idx + 1)); _parserFlags = ParserFlags.ReadAnythingInCurrentField | ParserFlags.ReadAnythingOnCurrentLine; + visitor.VisitPartialFieldContents(chunk.Slice(0, idx + 1)); } } else if (c == delimiter) { - visitor.VisitEndOfField(chunk.Slice(0, idx)); _parserFlags = ParserFlags.ReadAnythingOnCurrentLine; + visitor.VisitEndOfField(chunk.Slice(0, idx)); } else if (c == CR || c == LF) { @@ -269,8 +273,8 @@ public void ProcessNextChunk(ReadOnlySpan chunk, CsvReaderVisitorBase visi goto nextLoop; } - visitor.VisitPartialFieldContents(chunk); _parserFlags = ParserFlags.ReadAnythingInCurrentField | ParserFlags.ReadAnythingOnCurrentLine; + visitor.VisitPartialFieldContents(chunk); break; nextLoop:; @@ -329,8 +333,8 @@ private void PickUpFromLastTime(ref ReadOnlySpan readBuffer, CsvReaderVisi // in fact, it should pay off so well in so many cases that we can probably even // get away with making the other case really suboptimal, which is what it will // do when we pick up where we leave off after setting this flag. - visitor.VisitPartialFieldContents(readBuffer.Slice(0, idx)); _parserFlags |= ParserFlags.CutAtPotentiallyTerminalDoubleQuote; + visitor.VisitPartialFieldContents(readBuffer.Slice(0, idx)); readBuffer = default; return; } @@ -348,8 +352,8 @@ private void PickUpFromLastTime(ref ReadOnlySpan readBuffer, CsvReaderVisi // the double quote was the end of a quoted field, so send the entire data from // the beginning of this quoted field data chunk up to the double quote that // terminated it (excluding, of course, the double quote itself). - visitor.VisitEndOfField(readBuffer.Slice(0, idx)); _parserFlags = ParserFlags.ReadAnythingOnCurrentLine; + visitor.VisitEndOfField(readBuffer.Slice(0, idx)); } else if (b == CR || b == LF) { @@ -381,8 +385,8 @@ private void PickUpFromLastTime(ref ReadOnlySpan readBuffer, CsvReaderVisi byte b = readBuffer[idx]; if (b == _delimiter) { - visitor.VisitEndOfField(readBuffer.Slice(0, idx)); _parserFlags = ParserFlags.ReadAnythingOnCurrentLine; + visitor.VisitEndOfField(readBuffer.Slice(0, idx)); } else if (b == CR || b == LF) { @@ -433,16 +437,17 @@ private void HandleBufferCutAtPotentiallyTerminalDoubleQuote(ref ReadOnlySpan lastFieldDataChunk, CsvReaderVisitorBase visitor) { - if (!lastFieldDataChunk.IsEmpty || (_parserFlags & ParserFlags.ReadAnythingOnCurrentLine) != 0) + // even if the last field data chunk is empty, we still need to send it: we might be + // looking at a newline that immediately follows a comma, which is defined to mean + // an empty field at the end of a line. 
+ bool notify = !lastFieldDataChunk.IsEmpty || (_parserFlags & ParserFlags.ReadAnythingOnCurrentLine) != 0; + + _parserFlags = ParserFlags.None; + if (notify) { - // even if the last field data chunk is empty, we still need to send it: we might be - // looking at a newline that immediately follows a comma, which is defined to mean - // an empty field at the end of a line. visitor.VisitEndOfField(lastFieldDataChunk); visitor.VisitEndOfRecord(); } - - _parserFlags = ParserFlags.None; } } } From 35165fffcab8b8fa8fd1a21aa8bacaae67413781 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Thu, 13 Jun 2019 09:45:40 -0400 Subject: [PATCH 11/22] Give consumers a way to detect nonstandard data. Resolves #4 --- src/Cursively/CsvReaderVisitorBase.cs | 36 +++++++++++++++++++++++++-- src/Cursively/CsvTokenizer.cs | 22 ++++++++++++++-- 2 files changed, 54 insertions(+), 4 deletions(-) diff --git a/src/Cursively/CsvReaderVisitorBase.cs b/src/Cursively/CsvReaderVisitorBase.cs index 5b0eea6..0a748fc 100644 --- a/src/Cursively/CsvReaderVisitorBase.cs +++ b/src/Cursively/CsvReaderVisitorBase.cs @@ -29,8 +29,9 @@ public abstract class CsvReaderVisitorBase /// This method may be called at any time. /// /// - /// Only and may be - /// called directly after a call to this method. + /// Only , , and + /// may be called directly after a call to this + /// method. /// /// /// There are multiple reasons why this method may be called instead of going straight to @@ -94,6 +95,37 @@ public abstract class CsvReaderVisitorBase /// public abstract void VisitEndOfRecord(); + /// + /// + /// Notifies that the current field contains double-quote characters that do not comply with + /// RFC 4180, and so it is being processed according to this library's extra rules. + /// + /// + /// The default behavior of this method is to do nothing. Subclasses may wish to override + /// to add warnings / errors when processing streams that do not follow RFC 4180 and are + /// therefore in danger of being processed differently than other tools. + /// + /// + /// + /// + /// This method may only be called after a call to , + /// at most once per field (i.e., once it is called, it may not be called again until the + /// next call to ). + /// + /// + /// Only and may be + /// called directly after a call to this method. + /// + /// + /// Once called, the entire field described by all preceding consecutive calls to + /// calls, and all successive calls up to the next + /// , are considered to be "nonstandard". That means that this + /// method may be considered to affect the correctness of previous method calls, depending + /// on the semantics of the override. + /// + /// + public virtual void VisitNonstandardQuotedField() { } + private sealed class NullVisitor : CsvReaderVisitorBase { public override void VisitEndOfRecord() { } diff --git a/src/Cursively/CsvTokenizer.cs b/src/Cursively/CsvTokenizer.cs index b8fdaf6..c04ade4 100644 --- a/src/Cursively/CsvTokenizer.cs +++ b/src/Cursively/CsvTokenizer.cs @@ -253,6 +253,9 @@ public void ProcessNextChunk(ReadOnlySpan chunk, CsvReaderVisitorBase visi // choose to treat all such quotes as just regular data. _parserFlags = ParserFlags.ReadAnythingInCurrentField | ParserFlags.ReadAnythingOnCurrentLine; visitor.VisitPartialFieldContents(chunk.Slice(0, idx + 1)); + + // let the visitor know that this was nonstandard. 
+ visitor.VisitNonstandardQuotedField(); } } else if (c == delimiter) @@ -370,6 +373,9 @@ private void PickUpFromLastTime(ref ReadOnlySpan readBuffer, CsvReaderVisi _parserFlags |= ParserFlags.QuotedFieldDataEnded; visitor.VisitPartialFieldContents(readBuffer.Slice(0, idx)); visitor.VisitPartialFieldContents(readBuffer.Slice(idx + 1, 1)); + + // let the visitor know that this was nonstandard. + visitor.VisitNonstandardQuotedField(); } // slice off the data up to the quote and the next byte that we read. @@ -379,7 +385,9 @@ private void PickUpFromLastTime(ref ReadOnlySpan readBuffer, CsvReaderVisi { // this is expected to be rare: either we were cut between field reads, or we're // reading nonstandard field data where there's a quote that neither starts nor ends - // the field. + // the field; by this point, we don't save enough state to remember which case we're + // in, so VisitNonstandardQuotedField **MUST** have been correctly called (or not) + // before entering this section. for (int idx = 0; idx < readBuffer.Length; idx++) { byte b = readBuffer[idx]; @@ -416,7 +424,9 @@ private void HandleBufferCutAtPotentiallyTerminalDoubleQuote(ref ReadOnlySpan Date: Thu, 13 Jun 2019 09:50:01 -0400 Subject: [PATCH 12/22] fix and improve xmldoc --- src/Cursively/CsvReaderVisitorBase.cs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/Cursively/CsvReaderVisitorBase.cs b/src/Cursively/CsvReaderVisitorBase.cs index 0a748fc..a0118aa 100644 --- a/src/Cursively/CsvReaderVisitorBase.cs +++ b/src/Cursively/CsvReaderVisitorBase.cs @@ -71,7 +71,8 @@ public abstract class CsvReaderVisitorBase /// This method may be called at any time. /// /// - /// Any method, including this one, may be called directly after a call to this method. + /// Any method except , including this one, may be + /// called directly after a call to this method. /// /// /// This method may be called without a preceding @@ -109,8 +110,8 @@ public abstract class CsvReaderVisitorBase /// /// /// This method may only be called after a call to , - /// at most once per field (i.e., once it is called, it may not be called again until the - /// next call to ). + /// at most once per field (i.e., once it is called, it may not be called again until after + /// the next time that is called). /// /// /// Only and may be From 4f40606bf7d92fb137048ff83f004ead26d0c617 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Thu, 13 Jun 2019 09:53:37 -0400 Subject: [PATCH 13/22] improve xmldoc --- src/Cursively/CsvReaderVisitorBase.cs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/Cursively/CsvReaderVisitorBase.cs b/src/Cursively/CsvReaderVisitorBase.cs index a0118aa..09f25bb 100644 --- a/src/Cursively/CsvReaderVisitorBase.cs +++ b/src/Cursively/CsvReaderVisitorBase.cs @@ -109,9 +109,10 @@ public abstract class CsvReaderVisitorBase /// /// /// - /// This method may only be called after a call to , - /// at most once per field (i.e., once it is called, it may not be called again until after - /// the next time that is called). + /// This method may only be called as the very next method that gets called after a call to + /// , and only at most once per field (i.e., once it + /// is called, it may not be called again until a call brings + /// the tokenizer back to a state where RFC 4180 rules are expected). 
/// /// /// Only and may be From 39cf1d0c8a1ea26293ec49eb66f3bbb212bdece9 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Fri, 14 Jun 2019 08:52:50 -0400 Subject: [PATCH 14/22] Add nonstandard field test --- src/Cursively/CsvReaderVisitorBase.cs | 14 +++-- test/Cursively.Tests/CsvTokenizerTests.cs | 72 +++++++++++++++++++++-- 2 files changed, 77 insertions(+), 9 deletions(-) diff --git a/src/Cursively/CsvReaderVisitorBase.cs b/src/Cursively/CsvReaderVisitorBase.cs index 09f25bb..d2a5410 100644 --- a/src/Cursively/CsvReaderVisitorBase.cs +++ b/src/Cursively/CsvReaderVisitorBase.cs @@ -119,11 +119,15 @@ public abstract class CsvReaderVisitorBase /// called directly after a call to this method. /// /// - /// Once called, the entire field described by all preceding consecutive calls to - /// calls, and all successive calls up to the next - /// , are considered to be "nonstandard". That means that this - /// method may be considered to affect the correctness of previous method calls, depending - /// on the semantics of the override. + /// The last byte in the preceding call's chunk will + /// be the specific byte that was unexpected; all bytes before it were legal under RFC 4180. + /// So if this event is being raised because the tokenizer found a double-quote in a field + /// that did not start with a double-quote, then was + /// previously called with a chunk that ended with that double-quote. If it's being raised + /// because a double-quote was found in a quoted field that was not immediately followed by + /// a double-quote, delimiter, or line ending, then + /// was previously called with a chunk that ended with whichever byte immediately followed + /// the double-quote that ended the quoted part of the quoted field data. /// /// public virtual void VisitNonstandardQuotedField() { } diff --git a/test/Cursively.Tests/CsvTokenizerTests.cs b/test/Cursively.Tests/CsvTokenizerTests.cs index 76a5717..add2793 100644 --- a/test/Cursively.Tests/CsvTokenizerTests.cs +++ b/test/Cursively.Tests/CsvTokenizerTests.cs @@ -89,13 +89,36 @@ public void MemoryMappedCsvShouldMatchCsvHelper(string filePath) // act Csv.ProcessEntireFile(filePath, visitor); - var actual = visitor.Lines; + var actual = visitor.Records; // assert var expected = TokenizeCsvFileUsingCsvHelper(File.ReadAllBytes(filePath), ","); Assert.Equal(expected, actual); } + [Fact] + public void NonstandardQuotedFieldsShouldNotify() + { + // arrange + string csvFilePath = Path.Combine(TestCsvFilesFolderPath, "nonstandard.csv"); + var visitor = new NonstandardFieldVisitor(checked((int)new FileInfo(csvFilePath).Length)); + + // act + Csv.ProcessEntireFile(csvFilePath, visitor); + + // assert + string[] expectedContentsBeforeNonstandardFields = + { + "hello ", + "hello ", + "good\"", + @"100% coverage, with the version of Roslyn shipped with the .NET Core 3.0 Preview 4 SDK version, is impossible... +...unless I do something like making the byte immediately after this quoted field something with an ASCII value less than 13 that's not 10. +Tab ('\t') has an ASCII value of 9, which is perfect for this. 
so here's your tab: ", + }; + Assert.Equal(expectedContentsBeforeNonstandardFields, visitor.ContentsBeforeNonstandardFields); + } + private static List TokenizeCsvFileUsingCursively(ReadOnlySpan fileData, int chunkLength, byte delimiter) { var tokenizer = new CsvTokenizer(delimiter); @@ -108,7 +131,7 @@ private static List TokenizeCsvFileUsingCursively(ReadOnlySpan f tokenizer.ProcessNextChunk(fileData, visitor); tokenizer.ProcessEndOfStream(visitor); - return visitor.Lines; + return visitor.Records; } private static IEnumerable TokenizeCsvFileUsingCsvHelper(byte[] csvData, string delimiter) @@ -186,11 +209,11 @@ private sealed class StringBufferingVisitor : CsvReaderVisitorBase public StringBufferingVisitor(int fileLength) => _cutBuffer = new byte[fileLength]; - public List Lines { get; } = new List(); + public List Records { get; } = new List(); public override void VisitEndOfRecord() { - Lines.Add(_fields.ToArray()); + Records.Add(_fields.ToArray()); _fields.Clear(); } @@ -214,5 +237,46 @@ private void CopyToCutBuffer(ReadOnlySpan chunk) _cutBufferConsumed += chunk.Length; } } + + private sealed class NonstandardFieldVisitor : CsvReaderVisitorBase + { + private readonly Decoder _decoder = new UTF8Encoding(false, true).GetDecoder(); + + private readonly char[] _fieldBuffer; + + private int _fieldBufferConsumed; + + public NonstandardFieldVisitor(int byteCount) => + _fieldBuffer = new char[Encoding.UTF8.GetMaxCharCount(byteCount)]; + + public override void VisitEndOfField(ReadOnlySpan chunk) + { + VisitFieldContents(chunk, true); + _fieldBufferConsumed = 0; + } + + public List ContentsBeforeNonstandardFields { get; } = new List(); + + public override void VisitEndOfRecord() { } + + public override void VisitPartialFieldContents(ReadOnlySpan chunk) => + VisitFieldContents(chunk, false); + + public override void VisitNonstandardQuotedField() + { + VisitFieldContents(default, true); + ContentsBeforeNonstandardFields.Add(new string(_fieldBuffer, 0, _fieldBufferConsumed)); + } + + private void VisitFieldContents(ReadOnlySpan chunk, bool flush) + { + int cnt = _decoder.GetCharCount(chunk, flush); + if (cnt > 0) + { + _decoder.GetChars(chunk, new Span(_fieldBuffer, _fieldBufferConsumed, cnt), flush); + _fieldBufferConsumed += cnt; + } + } + } } } From 4a13e857ff75e0d0a966e8e20b723231c077619b Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Fri, 14 Jun 2019 09:55:49 -0400 Subject: [PATCH 15/22] Archive the benchmark files. They'll be decompressed at the start of the benchmark runs. 
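The extraction goes through a temporary directory and is only moved into place once it has fully completed, presumably so that an interrupted run can't leave behind a half-extracted folder that later runs would mistake for real input. A minimal sketch of that pattern (paths illustrative; the actual code lives in `GetCsvFiles` below):

```csharp
// using System.IO; using System.IO.Compression;
// Sketch: extract next to the zip, then move into place in one step.
string finalDir = "large-csv-files";
string tmpDir = finalDir + "-tmp";
if (!Directory.Exists(finalDir))
{
    if (Directory.Exists(tmpDir))
    {
        Directory.Delete(tmpDir, true); // leftover from an interrupted run
    }

    ZipFile.ExtractToDirectory(finalDir + ".zip", tmpDir);
    Directory.Move(tmpDir, finalDir); // finalDir appears only when complete
}
```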
--- .gitattributes | 2 +- test/Cursively.Benchmark/.gitignore | 1 + test/Cursively.Benchmark/Program.cs | 24 ++++++++++++++++--- test/Cursively.Benchmark/large-csv-files.zip | 3 +++ .../100-huge-records-quoted.csv | 3 --- .../large-csv-files/100-huge-records.csv | 3 --- .../large-csv-files/10k-empty-records.csv | 3 --- .../large-csv-files/mocked.csv | 3 --- .../large-csv-files/worldcitiespop.csv | 3 --- 9 files changed, 26 insertions(+), 19 deletions(-) create mode 100644 test/Cursively.Benchmark/.gitignore create mode 100644 test/Cursively.Benchmark/large-csv-files.zip delete mode 100644 test/Cursively.Benchmark/large-csv-files/100-huge-records-quoted.csv delete mode 100644 test/Cursively.Benchmark/large-csv-files/100-huge-records.csv delete mode 100644 test/Cursively.Benchmark/large-csv-files/10k-empty-records.csv delete mode 100644 test/Cursively.Benchmark/large-csv-files/mocked.csv delete mode 100644 test/Cursively.Benchmark/large-csv-files/worldcitiespop.csv diff --git a/.gitattributes b/.gitattributes index a2ad6cb..cd8c663 100644 --- a/.gitattributes +++ b/.gitattributes @@ -57,4 +57,4 @@ #*.PDF diff=astextplain #*.rtf diff=astextplain #*.RTF diff=astextplain -**/large-csv-files/** filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text diff --git a/test/Cursively.Benchmark/.gitignore b/test/Cursively.Benchmark/.gitignore new file mode 100644 index 0000000..ff63206 --- /dev/null +++ b/test/Cursively.Benchmark/.gitignore @@ -0,0 +1 @@ +large-csv-files/*.csv diff --git a/test/Cursively.Benchmark/Program.cs b/test/Cursively.Benchmark/Program.cs index 943359a..3da0da3 100644 --- a/test/Cursively.Benchmark/Program.cs +++ b/test/Cursively.Benchmark/Program.cs @@ -1,5 +1,6 @@ using System; using System.IO; +using System.IO.Compression; using System.Runtime.CompilerServices; using System.Text; @@ -79,9 +80,26 @@ public CsvFile(string fullPath) => public override string ToString() => FileName; } - private static CsvFile[] GetCsvFiles([CallerFilePath]string myLocation = null) => - Array.ConvertAll(Directory.GetFiles(Path.Combine(Path.GetDirectoryName(myLocation), "large-csv-files"), "*.csv"), - fullPath => new CsvFile(fullPath)); + private static CsvFile[] GetCsvFiles([CallerFilePath]string myLocation = null) + { + string csvFileDirectoryPath = Path.Combine(Path.GetDirectoryName(myLocation), "large-csv-files"); + if (!Directory.Exists(csvFileDirectoryPath)) + { + string tmpDirectoryPath = csvFileDirectoryPath + "-tmp"; + if (Directory.Exists(tmpDirectoryPath)) + { + Directory.Delete(tmpDirectoryPath, true); + } + + string zipFilePath = csvFileDirectoryPath + ".zip"; + Directory.CreateDirectory(tmpDirectoryPath); + ZipFile.ExtractToDirectory(zipFilePath, tmpDirectoryPath); + Directory.Move(tmpDirectoryPath, csvFileDirectoryPath); + } + + return Array.ConvertAll(Directory.GetFiles(csvFileDirectoryPath, "*.csv"), + fullPath => new CsvFile(fullPath)); + } private sealed class RowCountingVisitor : CsvReaderVisitorBase { diff --git a/test/Cursively.Benchmark/large-csv-files.zip b/test/Cursively.Benchmark/large-csv-files.zip new file mode 100644 index 0000000..2296075 --- /dev/null +++ b/test/Cursively.Benchmark/large-csv-files.zip @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc70c8d20921546b1fa4e587859a22a2edfce76325d8e5bc780b98a78409a76d +size 47747942 diff --git a/test/Cursively.Benchmark/large-csv-files/100-huge-records-quoted.csv b/test/Cursively.Benchmark/large-csv-files/100-huge-records-quoted.csv deleted file mode 100644 index 
718947c..0000000 --- a/test/Cursively.Benchmark/large-csv-files/100-huge-records-quoted.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:611a7ba4f69bf3ab34f1fbf3fbf4711bfa8fb91a210683bdf4c1915818f1cfe0 -size 4900444 diff --git a/test/Cursively.Benchmark/large-csv-files/100-huge-records.csv b/test/Cursively.Benchmark/large-csv-files/100-huge-records.csv deleted file mode 100644 index fde3ed5..0000000 --- a/test/Cursively.Benchmark/large-csv-files/100-huge-records.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3e82c977d84c24a6b16063b634cbeab1e8409b34724b0ecf07893f45f8aadb53 -size 2900444 diff --git a/test/Cursively.Benchmark/large-csv-files/10k-empty-records.csv b/test/Cursively.Benchmark/large-csv-files/10k-empty-records.csv deleted file mode 100644 index 61dd063..0000000 --- a/test/Cursively.Benchmark/large-csv-files/10k-empty-records.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6f1e211bf4eb14ab578ccf6aff141e8db41e80314b39b85fba5f047830f746e4 -size 10020000 diff --git a/test/Cursively.Benchmark/large-csv-files/mocked.csv b/test/Cursively.Benchmark/large-csv-files/mocked.csv deleted file mode 100644 index 4b45c74..0000000 --- a/test/Cursively.Benchmark/large-csv-files/mocked.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e01c74f0a9622e4ad72233ff35bfcc2663eca10b558d0d7e7f71932c6c981d4b -size 12731500 diff --git a/test/Cursively.Benchmark/large-csv-files/worldcitiespop.csv b/test/Cursively.Benchmark/large-csv-files/worldcitiespop.csv deleted file mode 100644 index 01dfbfe..0000000 --- a/test/Cursively.Benchmark/large-csv-files/worldcitiespop.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c4c7824338bbbc228f5d3e0089c57233136e83853821252ade7ed556b4bcfc1b -size 151492068 From 27e153198b9d3c312f4a11a65dada212b11042b2 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Sat, 15 Jun 2019 07:52:43 -0400 Subject: [PATCH 16/22] Rename this back down. --- test/Cursively.Benchmark/Program.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/Cursively.Benchmark/Program.cs b/test/Cursively.Benchmark/Program.cs index 3da0da3..c946b5a 100644 --- a/test/Cursively.Benchmark/Program.cs +++ b/test/Cursively.Benchmark/Program.cs @@ -22,7 +22,7 @@ public class Program [Benchmark(Baseline = true)] [ArgumentsSource(nameof(CsvFiles))] - public long CountRowsUsingCursivelyByteArray(CsvFile csvFile) + public long CountRowsUsingCursively(CsvFile csvFile) { var visitor = new RowCountingVisitor(); var tokenizer = new CsvTokenizer(); @@ -54,7 +54,7 @@ private static int Main() var prog = new Program(); foreach (var csvFile in CsvFiles) { - long rowCount = prog.CountRowsUsingCursivelyByteArray(csvFile); + long rowCount = prog.CountRowsUsingCursively(csvFile); if (prog.CountRowsUsingCsvHelper(csvFile) != rowCount) { Console.Error.WriteLine($"Failed on {csvFile}."); From 918e3060233a9daa1e6338ddc4fc5bf1be31f564 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Sat, 15 Jun 2019 10:37:27 -0400 Subject: [PATCH 17/22] Reorder these checks. 
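Delimiters and line endings are far more common than quotes in typical CSV data, so testing for them first should save comparisons per byte on the hot path; that appears to be the motivation here. After this patch, the main scan loop is shaped roughly like this (a simplified sketch; the full logic is in the diff below):

```csharp
// Simplified: check the common bytes before the rare QUOTE path.
for (int idx = 0; idx < chunk.Length; idx++)
{
    byte c = chunk[idx];
    if (c == delimiter)
    {
        // common: a field ended on the current line
    }
    else if (c == CR || c == LF)
    {
        // common: the record ended
    }
    else if (c == QUOTE)
    {
        // rare: quoted-field handling
    }
}
```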
--- src/Cursively/CsvTokenizer.cs | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/Cursively/CsvTokenizer.cs b/src/Cursively/CsvTokenizer.cs index c04ade4..61955cc 100644 --- a/src/Cursively/CsvTokenizer.cs +++ b/src/Cursively/CsvTokenizer.cs @@ -240,7 +240,16 @@ public void ProcessNextChunk(ReadOnlySpan chunk, CsvReaderVisitorBase visi for (int idx = 0; idx < chunk.Length; idx++) { byte c = chunk[idx]; - if (c == QUOTE) + if (c == delimiter) + { + _parserFlags = ParserFlags.ReadAnythingOnCurrentLine; + visitor.VisitEndOfField(chunk.Slice(0, idx)); + } + else if (c == CR || c == LF) + { + ProcessEndOfLine(chunk.Slice(0, idx), visitor); + } + else if (c == QUOTE) { if (idx == 0) { @@ -258,15 +267,6 @@ public void ProcessNextChunk(ReadOnlySpan chunk, CsvReaderVisitorBase visi visitor.VisitNonstandardQuotedField(); } } - else if (c == delimiter) - { - _parserFlags = ParserFlags.ReadAnythingOnCurrentLine; - visitor.VisitEndOfField(chunk.Slice(0, idx)); - } - else if (c == CR || c == LF) - { - ProcessEndOfLine(chunk.Slice(0, idx), visitor); - } else { continue; @@ -311,12 +311,6 @@ public void ProcessEndOfStream(CsvReaderVisitorBase visitor) private void PickUpFromLastTime(ref ReadOnlySpan readBuffer, CsvReaderVisitorBase visitor) { - if ((_parserFlags & ParserFlags.CutAtPotentiallyTerminalDoubleQuote) != 0) - { - HandleBufferCutAtPotentiallyTerminalDoubleQuote(ref readBuffer, visitor); - return; - } - if ((_parserFlags & (ParserFlags.CurrentFieldStartedWithQuote | ParserFlags.QuotedFieldDataEnded)) == ParserFlags.CurrentFieldStartedWithQuote) { int idx = readBuffer.IndexOf(QUOTE); @@ -388,6 +382,12 @@ private void PickUpFromLastTime(ref ReadOnlySpan readBuffer, CsvReaderVisi // the field; by this point, we don't save enough state to remember which case we're // in, so VisitNonstandardQuotedField **MUST** have been correctly called (or not) // before entering this section. + if ((_parserFlags & ParserFlags.CutAtPotentiallyTerminalDoubleQuote) != 0) + { + HandleBufferCutAtPotentiallyTerminalDoubleQuote(ref readBuffer, visitor); + return; + } + for (int idx = 0; idx < readBuffer.Length; idx++) { byte b = readBuffer[idx]; From 2f1f7cc4c19a9286bf2dfa0b3db8af2adc40e1aa Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Sat, 15 Jun 2019 10:42:21 -0400 Subject: [PATCH 18/22] Fix an oversight in the previous commit. Fields cut at potentially terminal double-quotes are, of course, quoted. --- src/Cursively/CsvTokenizer.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Cursively/CsvTokenizer.cs b/src/Cursively/CsvTokenizer.cs index 61955cc..ce80a4d 100644 --- a/src/Cursively/CsvTokenizer.cs +++ b/src/Cursively/CsvTokenizer.cs @@ -311,7 +311,7 @@ public void ProcessEndOfStream(CsvReaderVisitorBase visitor) private void PickUpFromLastTime(ref ReadOnlySpan readBuffer, CsvReaderVisitorBase visitor) { - if ((_parserFlags & (ParserFlags.CurrentFieldStartedWithQuote | ParserFlags.QuotedFieldDataEnded)) == ParserFlags.CurrentFieldStartedWithQuote) + if ((_parserFlags & (ParserFlags.CurrentFieldStartedWithQuote | ParserFlags.QuotedFieldDataEnded | ParserFlags.CutAtPotentiallyTerminalDoubleQuote)) == ParserFlags.CurrentFieldStartedWithQuote) { int idx = readBuffer.IndexOf(QUOTE); if (idx < 0) From 3c73bd040859492dd3d46c97d12fed1eeb103d61 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Sat, 15 Jun 2019 14:32:56 -0400 Subject: [PATCH 19/22] Add first-class support for headered CSV streams. 
This support comes in the form of an intermediate visitor base class that has multiple built-in protections designed to help thwart DDoS attacks. Resolves #5 --- .../CsvReaderVisitorWithUTF8HeadersBase.cs | 471 ++++++++++++++++++ src/Cursively/CsvTokenizer.cs | 10 +- src/Cursively/Cursively.csproj | 1 + src/Cursively/CursivelyDataStreamException.cs | 31 ++ .../CursivelyDecoderExceptionFallback.cs | 42 ++ .../CursivelyExtraDataFieldsException.cs | 25 + .../CursivelyHeaderIsTooLongException.cs | 25 + .../CursivelyHeadersAreNotUTF8Exception.cs | 32 ++ .../CursivelyMissingDataFieldsException.cs | 25 + .../CursivelyTooManyHeadersException.cs | 25 + test/Cursively.Tests/CsvTokenizerTests.cs | 140 +++++- test/Cursively.Tests/Cursively.Tests.csproj | 2 +- .../invalid/invalid-utf8-in-header.csv | 2 + .../invalid/missing-data-fields.csv | 3 + .../invalid/too-many-data-fields.csv | 2 + .../valid/invalid-utf8-outside-header.csv | 2 + .../with-headers/valid/simple.csv | 3 + 17 files changed, 825 insertions(+), 16 deletions(-) create mode 100644 src/Cursively/CsvReaderVisitorWithUTF8HeadersBase.cs create mode 100644 src/Cursively/CursivelyDataStreamException.cs create mode 100644 src/Cursively/CursivelyDecoderExceptionFallback.cs create mode 100644 src/Cursively/CursivelyExtraDataFieldsException.cs create mode 100644 src/Cursively/CursivelyHeaderIsTooLongException.cs create mode 100644 src/Cursively/CursivelyHeadersAreNotUTF8Exception.cs create mode 100644 src/Cursively/CursivelyMissingDataFieldsException.cs create mode 100644 src/Cursively/CursivelyTooManyHeadersException.cs create mode 100644 test/Cursively.Tests/TestCsvFiles/with-headers/invalid/invalid-utf8-in-header.csv create mode 100644 test/Cursively.Tests/TestCsvFiles/with-headers/invalid/missing-data-fields.csv create mode 100644 test/Cursively.Tests/TestCsvFiles/with-headers/invalid/too-many-data-fields.csv create mode 100644 test/Cursively.Tests/TestCsvFiles/with-headers/valid/invalid-utf8-outside-header.csv create mode 100644 test/Cursively.Tests/TestCsvFiles/with-headers/valid/simple.csv diff --git a/src/Cursively/CsvReaderVisitorWithUTF8HeadersBase.cs b/src/Cursively/CsvReaderVisitorWithUTF8HeadersBase.cs new file mode 100644 index 0000000..dae9ca4 --- /dev/null +++ b/src/Cursively/CsvReaderVisitorWithUTF8HeadersBase.cs @@ -0,0 +1,471 @@ +using System; +using System.Collections.Immutable; +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Text; + +namespace Cursively +{ + /// + /// + /// Intermediate base class for CSV reader visitors that don't want to have to implement header + /// handling by themselves. + /// + /// + /// Instances of this class are tied to a single CSV stream and cannot be reused or reset for + /// use with other CSV streams. + /// + /// + /// Each instance of this visitor has an upper-bound on the maximum number of headers and on the + /// maximum length of each header. CSV streams that exceed these limits will cause this class + /// to throw exceptions, and behavior of a particular instance is undefined once this happens. + /// + /// + /// + /// + /// The following input-dependent exceptions may get thrown when using this visitor, all of + /// which inherit from : + /// + /// + /// + /// + /// if is + /// being used and the CSV stream contains a sequence of invalid UTF-8 bytes. + /// + /// + /// + /// + /// if the CSV stream contains one or more + /// headers that are longer than the configured maximum. 
+    ///
+    ///
+    ///
+    /// if the CSV stream contains more headers than
+    /// the configured maximum.
+    ///
+    ///
+    ///
+    ///
+    /// , by default, if a data record contains fewer
+    /// fields than the header record.
+    ///
+    ///
+    ///
+    ///
+    /// , by default, if a data record contains more
+    /// fields than the header record.
+    ///
+    ///
+    ///
+    ///
+    public abstract class CsvReaderVisitorWithUTF8HeadersBase : CsvReaderVisitorBase
+    {
+        ///
+        /// The value used by to initialize the
+        /// maximum number of headers (1,000).
+        ///
+        protected static readonly int DefaultMaxHeaderCount = 1_000;
+
+        ///
+        /// The value used by to initialize the
+        /// maximum length, in UTF-16 code units, of a single header (100).
+        ///
+        protected static readonly int DefaultMaxHeaderLength = 100;
+
+        ///
+        /// The value used by to initialize the
+        /// value indicating whether or not to ignore a leading UTF-8 BOM (true).
+        ///
+        protected static readonly bool DefaultIgnoreUTF8IdentifierOnFirstHeaderField = true;
+
+        ///
+        /// The value used by to initialize the
+        /// fallback logic when the decoder encounters invalid UTF-8 bytes (throw an exception).
+        ///
+        protected static readonly DecoderFallback DefaultDecoderFallback = new CursivelyDecoderExceptionFallback();
+
+        private static readonly UTF8Encoding EncodingToUse = new UTF8Encoding(false, false);
+
+        private readonly Decoder _headerDecoder;
+
+        private readonly ImmutableArray.Builder _headersBuilder;
+
+        private readonly bool _ignoreUTF8IdentifierOnFirstHeaderField;
+
+        private char[] _headerBuffer;
+
+        private ImmutableArray _headers;
+
+        private int _headerBufferConsumed;
+
+        private int _currentFieldIndex = -1;
+
+        ///
+        /// Initializes a new instance of the class.
+        ///
+        protected CsvReaderVisitorWithUTF8HeadersBase()
+            : this(maxHeaderCount: DefaultMaxHeaderCount,
+                   maxHeaderLength: DefaultMaxHeaderLength,
+                   ignoreUTF8IdentifierOnFirstHeaderField: DefaultIgnoreUTF8IdentifierOnFirstHeaderField,
+                   decoderFallback: DefaultDecoderFallback)
+        {
+        }
+
+        ///
+        /// Initializes a new instance of the class.
+        ///
+        ///
+        /// The maximum number of headers to allow.
+        /// Default: .
+        ///
+        ///
+        /// The maximum length, in UTF-16 code units, of any particular header.
+        /// Default: .
+        ///
+        ///
+        /// A value indicating whether or not to ignore a leading UTF-8 BOM.
+        /// Default: .
+        ///
+        ///
+        /// The fallback logic used when the decoder encounters invalid UTF-8 bytes.
+        /// Default: .
+        ///
+        ///
+        /// Thrown when is .
+        ///
+        ///
+        /// Thrown when or is
+        /// less than 1.
+        ///
+        protected CsvReaderVisitorWithUTF8HeadersBase(int maxHeaderCount, int maxHeaderLength, bool ignoreUTF8IdentifierOnFirstHeaderField, DecoderFallback decoderFallback)
+        {
+            if (maxHeaderCount < 1)
+            {
+                throw new ArgumentOutOfRangeException(nameof(maxHeaderCount), maxHeaderCount, "Must be greater than zero.");
+            }
+
+            if (maxHeaderLength < 1)
+            {
+                throw new ArgumentOutOfRangeException(nameof(maxHeaderLength), maxHeaderLength, "Must be greater than zero.");
+            }
+
+            if (decoderFallback is null)
+            {
+                throw new ArgumentNullException(nameof(decoderFallback));
+            }
+
+            _ignoreUTF8IdentifierOnFirstHeaderField = ignoreUTF8IdentifierOnFirstHeaderField;
+
+            _headersBuilder = ImmutableArray.CreateBuilder(maxHeaderCount);
+
+            _headerBuffer = new char[maxHeaderLength];
+
+            _headerDecoder = EncodingToUse.GetDecoder();
+            _headerDecoder.Fallback = decoderFallback;
+        }
+
+        ///
+        ///
+        /// Gets the headers of the CSV stream.
+        ///
+        ///
+        /// Only valid after has been called.
+ /// + /// + /// + /// Thrown when trying to access this value before has + /// been called. + /// + /// + /// Once initialized, the value will remain the same for as long as this object instance + /// stays alive. + /// + protected ImmutableArray Headers + { + get + { + if (_headers.IsDefault) + { + ThrowExceptionWhenHeadersAreStillBeingBuilt(); + } + + return _headers; + } + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static void ThrowExceptionWhenHeadersAreStillBeingBuilt() => + throw new InvalidOperationException("Headers are still being built."); + + /// + /// Gets the zero-based index of the field that is currently being read. The value should + /// be the length of during and + /// , except after or + /// has been called. + /// + protected int CurrentFieldIndex => _currentFieldIndex; + + /// + public sealed override void VisitPartialFieldContents(ReadOnlySpan chunk) + { + if (_headers.IsDefault || _currentFieldIndex >= _headers.Length) + { + VisitPartialFieldContentsSlow(chunk); + } + else + { + VisitPartialDataFieldContents(chunk); + } + } + + /// + public sealed override void VisitEndOfField(ReadOnlySpan chunk) + { + if (_headers.IsDefault || _currentFieldIndex >= _headers.Length) + { + VisitEndOfFieldSlow(chunk); + } + else + { + VisitEndOfDataField(chunk); + ++_currentFieldIndex; + } + } + + /// + public sealed override void VisitEndOfRecord() + { + if (_headers.IsDefault || _currentFieldIndex != _headers.Length) + { + VisitEndOfRecordSlow(); + } + else + { + VisitEndOfDataRecord(); + _currentFieldIndex = 0; + } + } + + /// + /// + /// Notifies that all headers have been read and is safe to read. + /// + /// + /// The default behavior is to do nothing. + /// + /// + protected virtual void VisitEndOfHeaderRecord() { } + + /// + /// Visits part of a non-header field's data. + /// + /// + /// The data from this part of the field. + /// + /// + /// See documentation for for + /// details about when and how this method will be called. + /// + protected abstract void VisitPartialDataFieldContents(ReadOnlySpan chunk); + + /// + /// Visits the last part of a non-header field's data. + /// + /// + /// The data from the last part of the field. + /// + /// + /// See documentation for for + /// details about when and how this method will be called. + /// + protected abstract void VisitEndOfDataField(ReadOnlySpan chunk); + + /// + /// Notifies that all fields in the current non-header record have been visited. + /// + /// + /// See documentation for for + /// details about when and how this method will be called. + /// + protected abstract void VisitEndOfDataRecord(); + + /// + /// + /// Notifies that the current non-header record is about to be terminated without reading + /// all the fields that were identified in the header record. + /// + /// + /// The default behavior is to throw . + /// + /// + protected virtual void VisitMissingDataFields() + { + if (_headers.IsDefault) + { + // we will never do this, but a cheeky subclass might. + throw new InvalidOperationException("This method is only intended to be called by the base class."); + } + + throw new CursivelyMissingDataFieldsException(_headers.Length, _currentFieldIndex); + } + + /// + /// + /// Notifies that data for a field is about to be read on a non-header record, but all the + /// fields that were identified in the header record have already been read. + /// + /// + /// This method is called before every single or + /// call for fields not present in the header record. 
+ /// + /// + /// The default behavior is to throw . + /// + /// + protected virtual void VisitUnexpectedDataField() + { + if (_headers.IsDefault) + { + // we will never do this, but a cheeky subclass might. + throw new InvalidOperationException("This method is only intended to be called by the base class."); + } + + throw new CursivelyExtraDataFieldsException(_headers.Length); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private unsafe void VisitPartialFieldContentsSlow(ReadOnlySpan chunk) + { + if (_headers.IsDefault) + { + if (_headersBuilder.Capacity == _headersBuilder.Count) + { + throw new CursivelyTooManyHeadersException(_headersBuilder.Capacity); + } + + if (chunk.IsEmpty) + { + // the tokenizer will never do this, but an external caller might. + return; + } + + fixed (byte* b = &chunk[0]) + { + VisitHeaderChunk(b, chunk.Length, false); + } + } + else + { + Debug.Assert(_currentFieldIndex >= _headers.Length, "Another condition brought us into VisitPartialFieldContentsSlow without updating this bit."); + VisitUnexpectedDataField(); + VisitPartialDataFieldContents(chunk); + } + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private unsafe void VisitEndOfFieldSlow(ReadOnlySpan chunk) + { + if (_headers.IsDefault) + { + if (_headersBuilder.Capacity == _headersBuilder.Count) + { + throw new CursivelyTooManyHeadersException(_headersBuilder.Capacity); + } + + if (chunk.IsEmpty) + { + // the tokenizer will never do this, but an external caller might. note that + // the Decoder methods require a non-null pointer, even if the length is zero. + byte b = 0xFF; + VisitHeaderChunk(&b, 0, true); + } + else + { + fixed (byte* b = &chunk[0]) + { + VisitHeaderChunk(b, chunk.Length, true); + } + } + + int headerBufferOffset = 0; + + if (_headersBuilder.Count == 0 && + _ignoreUTF8IdentifierOnFirstHeaderField && + _headerBufferConsumed > 0 && + _headerBuffer[0] == '\uFEFF') + { + headerBufferOffset = 1; + } + + _headersBuilder.Add(new string(_headerBuffer, headerBufferOffset, _headerBufferConsumed - headerBufferOffset)); + _headerBufferConsumed = 0; + ++_currentFieldIndex; + } + else + { + Debug.Assert(_currentFieldIndex >= _headers.Length, "Another condition brought us into VisitEndOfFieldSlow without updating this bit."); + VisitUnexpectedDataField(); + VisitEndOfDataField(chunk); + _currentFieldIndex = checked(_currentFieldIndex + 1); + } + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private void VisitEndOfRecordSlow() + { + if (_headers.IsDefault) + { + if (_headersBuilder.Count == 0) + { + // the tokenizer will never do this, but an external caller might. + throw new InvalidOperationException("No fields were present in the header record."); + } + + // this is almost equivalent to setting _headers = _headersBuilder.ToImmutable(), + // but this does a better job rewarding people for setting the max field count to + // the actual field count, which will often be the case. + _headersBuilder.Capacity = _headersBuilder.Count; + _headers = _headersBuilder.MoveToImmutable(); + _currentFieldIndex = _headers.Length; + + // we're done building headers, so free up our buffer. + _headerBuffer = null; + + // let the subclass know that the headers are ready, in case it wants to set up some + // stuff before the field data starts rolling in. 
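+                // (from this point on, the Headers property will never change,
+                // so the override can safely cache anything derived from it.)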
+ VisitEndOfHeaderRecord(); + } + else + { + Debug.Assert(_currentFieldIndex != _headers.Length, "Another condition brought us into VisitEndOfRecordSlow without updating this bit."); + if (_currentFieldIndex < _headers.Length) + { + VisitMissingDataFields(); + } + + VisitEndOfDataRecord(); + } + + _currentFieldIndex = 0; + } + + private unsafe void VisitHeaderChunk(byte* b, int byteCount, bool flush) + { + int charCount = _headerDecoder.GetCharCount(b, byteCount, flush); + if (_headerBufferConsumed + charCount <= _headerBuffer.Length) + { + fixed (char* c = &_headerBuffer[_headerBufferConsumed]) + { + _headerDecoder.GetChars(b, byteCount, c, charCount, flush); + } + } + else + { + throw new CursivelyHeaderIsTooLongException(_headerBuffer.Length); + } + + _headerBufferConsumed += charCount; + } + } +} diff --git a/src/Cursively/CsvTokenizer.cs b/src/Cursively/CsvTokenizer.cs index ce80a4d..0595009 100644 --- a/src/Cursively/CsvTokenizer.cs +++ b/src/Cursively/CsvTokenizer.cs @@ -247,7 +247,7 @@ public void ProcessNextChunk(ReadOnlySpan chunk, CsvReaderVisitorBase visi } else if (c == CR || c == LF) { - ProcessEndOfLine(chunk.Slice(0, idx), visitor); + ProcessEndOfRecord(chunk.Slice(0, idx), visitor); } else if (c == QUOTE) { @@ -306,7 +306,7 @@ public void ProcessEndOfStream(CsvReaderVisitorBase visitor) visitor = CsvReaderVisitorBase.Null; } - ProcessEndOfLine(default, visitor); + ProcessEndOfRecord(default, visitor); } private void PickUpFromLastTime(ref ReadOnlySpan readBuffer, CsvReaderVisitorBase visitor) @@ -356,7 +356,7 @@ private void PickUpFromLastTime(ref ReadOnlySpan readBuffer, CsvReaderVisi { // same thing as the delimiter case, just the field ended at the end of a line // instead of the end of a field on the current line. - ProcessEndOfLine(readBuffer.Slice(0, idx), visitor); + ProcessEndOfRecord(readBuffer.Slice(0, idx), visitor); } else { @@ -398,7 +398,7 @@ private void PickUpFromLastTime(ref ReadOnlySpan readBuffer, CsvReaderVisi } else if (b == CR || b == LF) { - ProcessEndOfLine(readBuffer.Slice(0, idx), visitor); + ProcessEndOfRecord(readBuffer.Slice(0, idx), visitor); } else { @@ -453,7 +453,7 @@ private void HandleBufferCutAtPotentiallyTerminalDoubleQuote(ref ReadOnlySpan lastFieldDataChunk, CsvReaderVisitorBase visitor) + private void ProcessEndOfRecord(ReadOnlySpan lastFieldDataChunk, CsvReaderVisitorBase visitor) { // even if the last field data chunk is empty, we still need to send it: we might be // looking at a newline that immediately follows a comma, which is defined to mean diff --git a/src/Cursively/Cursively.csproj b/src/Cursively/Cursively.csproj index 06e06e7..ebdabd9 100644 --- a/src/Cursively/Cursively.csproj +++ b/src/Cursively/Cursively.csproj @@ -16,6 +16,7 @@ + diff --git a/src/Cursively/CursivelyDataStreamException.cs b/src/Cursively/CursivelyDataStreamException.cs new file mode 100644 index 0000000..9926d37 --- /dev/null +++ b/src/Cursively/CursivelyDataStreamException.cs @@ -0,0 +1,31 @@ +using System; +using System.Diagnostics.CodeAnalysis; +using System.Runtime.Serialization; + +namespace Cursively +{ + /// + /// Serves as the base class for exceptions thrown by this library to indicate problems with the + /// actual contents of a CSV stream. 
+ /// + [Serializable] + [SuppressMessage("Microsoft.Design", "CA1032:ImplementStandardExceptionConstructors")] + public abstract class CursivelyDataStreamException : Exception + { + private protected CursivelyDataStreamException(string message) + : base(message) + { + } + + private protected CursivelyDataStreamException(string message, Exception innerException) + : base(message, innerException) + { + } + + [SuppressMessage("Microsoft.Usage", "CA2229:ImplementSerializationConstructors")] + private protected CursivelyDataStreamException(SerializationInfo info, StreamingContext context) + : base(info, context) + { + } + } +} diff --git a/src/Cursively/CursivelyDecoderExceptionFallback.cs b/src/Cursively/CursivelyDecoderExceptionFallback.cs new file mode 100644 index 0000000..22192ea --- /dev/null +++ b/src/Cursively/CursivelyDecoderExceptionFallback.cs @@ -0,0 +1,42 @@ +using System.Text; + +namespace Cursively +{ + internal sealed class CursivelyDecoderExceptionFallback : DecoderFallback + { + public override int MaxCharCount => 0; + + public override DecoderFallbackBuffer CreateFallbackBuffer() => new CursivelyDecoderExceptionFallbackBuffer(); + + public override bool Equals(object obj) => obj is CursivelyDecoderExceptionFallback; + + public override int GetHashCode() => 1234; + + private sealed class CursivelyDecoderExceptionFallbackBuffer : DecoderFallbackBuffer + { + public override int Remaining => 0; + + public override char GetNextChar() => '\0'; + + public override bool MovePrevious() => false; + + public override bool Fallback(byte[] bytesUnknown, int index) + { + // use the built-in logic to get a helpful exception message. + var inner = new DecoderExceptionFallbackBuffer(); + try + { + return inner.Fallback(bytesUnknown, index); + } + catch (DecoderFallbackException ex) + { + // wrap it. C# / .NET do not support multiple inheritance, and I think it's + // more important for consumers to be able to catch CursivelyDataStreamException + // for all exceptions in the form of "this breaks one of Cursively's rules, but + // the system is otherwise operating normally". + throw new CursivelyHeadersAreNotUTF8Exception(ex); + } + } + } + } +} diff --git a/src/Cursively/CursivelyExtraDataFieldsException.cs b/src/Cursively/CursivelyExtraDataFieldsException.cs new file mode 100644 index 0000000..71996b1 --- /dev/null +++ b/src/Cursively/CursivelyExtraDataFieldsException.cs @@ -0,0 +1,25 @@ +using System; +using System.Diagnostics.CodeAnalysis; +using System.Runtime.Serialization; + +namespace Cursively +{ + /// + /// Raised by , by default, when a data record + /// contains more fields than the header record. 
+ /// + [Serializable] + [SuppressMessage("Microsoft.Design", "CA1032:ImplementStandardExceptionConstructors")] + public sealed class CursivelyExtraDataFieldsException : CursivelyDataStreamException + { + internal CursivelyExtraDataFieldsException(int headerFieldCount) + : base($"CSV stream contains a non-header record with more fields than the {headerFieldCount} field(s) present in the header record.") + { + } + + private CursivelyExtraDataFieldsException(SerializationInfo info, StreamingContext context) + : base(info, context) + { + } + } +} diff --git a/src/Cursively/CursivelyHeaderIsTooLongException.cs b/src/Cursively/CursivelyHeaderIsTooLongException.cs new file mode 100644 index 0000000..882098e --- /dev/null +++ b/src/Cursively/CursivelyHeaderIsTooLongException.cs @@ -0,0 +1,25 @@ +using System; +using System.Diagnostics.CodeAnalysis; +using System.Runtime.Serialization; + +namespace Cursively +{ + /// + /// Raised by when the length of a header + /// exceeds the configured maximum. + /// + [Serializable] + [SuppressMessage("Microsoft.Design", "CA1032:ImplementStandardExceptionConstructors")] + public sealed class CursivelyHeaderIsTooLongException : CursivelyDataStreamException + { + internal CursivelyHeaderIsTooLongException(int maxLength) + : base($"CSV stream contains a header that is longer than the configured max length of {maxLength}.") + { + } + + private CursivelyHeaderIsTooLongException(SerializationInfo info, StreamingContext context) + : base(info, context) + { + } + } +} diff --git a/src/Cursively/CursivelyHeadersAreNotUTF8Exception.cs b/src/Cursively/CursivelyHeadersAreNotUTF8Exception.cs new file mode 100644 index 0000000..283068b --- /dev/null +++ b/src/Cursively/CursivelyHeadersAreNotUTF8Exception.cs @@ -0,0 +1,32 @@ +using System; +using System.Diagnostics.CodeAnalysis; +using System.Runtime.Serialization; +using System.Text; + +namespace Cursively +{ + /// + /// Raised by , by default, when the header + /// record contains invalid UTF-8 bytes. + /// + [Serializable] + [SuppressMessage("Microsoft.Design", "CA1032:ImplementStandardExceptionConstructors")] + public sealed class CursivelyHeadersAreNotUTF8Exception : CursivelyDataStreamException + { + internal CursivelyHeadersAreNotUTF8Exception(DecoderFallbackException innerException) + : base(innerException.Message, innerException) + { + } + + private CursivelyHeadersAreNotUTF8Exception(SerializationInfo info, StreamingContext context) + : base(info, context) + { + } + + /// + /// Gets the instance that holds the actual decoder + /// state when the current exception was raised. + /// + public DecoderFallbackException InnerDecoderFallbackException => (DecoderFallbackException)InnerException; + } +} diff --git a/src/Cursively/CursivelyMissingDataFieldsException.cs b/src/Cursively/CursivelyMissingDataFieldsException.cs new file mode 100644 index 0000000..03c776d --- /dev/null +++ b/src/Cursively/CursivelyMissingDataFieldsException.cs @@ -0,0 +1,25 @@ +using System; +using System.Diagnostics.CodeAnalysis; +using System.Runtime.Serialization; + +namespace Cursively +{ + /// + /// Raised by , by default, when a data record + /// contains fewer fields than the header record. 
+ /// + [Serializable] + [SuppressMessage("Microsoft.Design", "CA1032:ImplementStandardExceptionConstructors")] + public sealed class CursivelyMissingDataFieldsException : CursivelyDataStreamException + { + internal CursivelyMissingDataFieldsException(int headerFieldCount, int dataFieldCount) + : base($"CSV stream contains a non-header record with only {dataFieldCount} field(s), fewer than the {headerFieldCount} field(s) present in the header record.") + { + } + + private CursivelyMissingDataFieldsException(SerializationInfo info, StreamingContext context) + : base(info, context) + { + } + } +} diff --git a/src/Cursively/CursivelyTooManyHeadersException.cs b/src/Cursively/CursivelyTooManyHeadersException.cs new file mode 100644 index 0000000..ad5f876 --- /dev/null +++ b/src/Cursively/CursivelyTooManyHeadersException.cs @@ -0,0 +1,25 @@ +using System; +using System.Diagnostics.CodeAnalysis; +using System.Runtime.Serialization; + +namespace Cursively +{ + /// + /// Raised by when the number of headers + /// exceeds the configured maximum. + /// + [Serializable] + [SuppressMessage("Microsoft.Design", "CA1032:ImplementStandardExceptionConstructors")] + public sealed class CursivelyTooManyHeadersException : CursivelyDataStreamException + { + internal CursivelyTooManyHeadersException(int maxHeaderCount) + : base($"CSV stream contains more headers than the configured maximum of {maxHeaderCount}.") + { + } + + private CursivelyTooManyHeadersException(SerializationInfo info, StreamingContext context) + : base(info, context) + { + } + } +} diff --git a/test/Cursively.Tests/CsvTokenizerTests.cs b/test/Cursively.Tests/CsvTokenizerTests.cs index add2793..4095bd8 100644 --- a/test/Cursively.Tests/CsvTokenizerTests.cs +++ b/test/Cursively.Tests/CsvTokenizerTests.cs @@ -20,15 +20,12 @@ public sealed class CsvTokenizerTests private static readonly byte[] TestDelimiters = { (byte)',', (byte)'\t' }; public static IEnumerable TestCsvFiles => - from filePath in Directory.EnumerateFiles(TestCsvFilesFolderPath, "*.csv") + from filePath in Directory.EnumerateFiles(TestCsvFilesFolderPath, "*.csv", SearchOption.AllDirectories) select new object[] { filePath }; - public static IEnumerable TestCsvFilesWithChunkLengthsAndDelimiters => - from filePath in Directory.EnumerateFiles(TestCsvFilesFolderPath, "*.csv") - let fileName = Path.GetFileNameWithoutExtension(filePath) - from chunkLength in TestChunkLengths - from delimiter in TestDelimiters - select new object[] { fileName, chunkLength, delimiter }; + public static IEnumerable TestCsvFilesWithChunkLengthsAndDelimiters => GetTestCsvFilesWithChunkLengthsAndDelimiters(); + + public static IEnumerable TestValidHeaderedCsvFilesWithChunkLengthsAndDelimiters => GetTestCsvFilesWithChunkLengthsAndDelimiters("with-headers", "valid"); [Theory] [InlineData((byte)0x0A)] @@ -56,10 +53,10 @@ public void NullVisitorShouldBeFine(string filePath) [Theory] [MemberData(nameof(TestCsvFilesWithChunkLengthsAndDelimiters))] - public void CsvTokenizationShouldMatchCsvHelper(string fileName, int chunkLength, byte delimiter) + public void CsvTokenizationShouldMatchCsvHelper(string filePath, int chunkLength, byte delimiter) { // arrange - byte[] fileDataTemplate = File.ReadAllBytes(Path.Combine(TestCsvFilesFolderPath, fileName + ".csv")); + byte[] fileDataTemplate = File.ReadAllBytes(filePath); for (int i = 0; i < fileDataTemplate.Length; i++) { if (fileDataTemplate[i] == (byte)',') @@ -68,7 +65,7 @@ public void CsvTokenizationShouldMatchCsvHelper(string fileName, int chunkLength } } - int 
randomSeed = HashCode.Combine(fileName, chunkLength, delimiter); + int randomSeed = HashCode.Combine(filePath, chunkLength, delimiter); foreach (byte[] fileData in VaryLineEndings(fileDataTemplate, randomSeed)) { // act @@ -119,6 +116,62 @@ public void NonstandardQuotedFieldsShouldNotify() Assert.Equal(expectedContentsBeforeNonstandardFields, visitor.ContentsBeforeNonstandardFields); } + [Theory] + [MemberData(nameof(TestValidHeaderedCsvFilesWithChunkLengthsAndDelimiters))] + public void HeaderedCsvTokenizationShouldMatchCsvHelper(string filePath, int chunkLength, byte delimiter) + { + // arrange + byte[] fileDataTemplate = File.ReadAllBytes(filePath); + for (int i = 0; i < fileDataTemplate.Length; i++) + { + if (fileDataTemplate[i] == (byte)',') + { + fileDataTemplate[i] = delimiter; + } + } + + int randomSeed = HashCode.Combine(filePath, chunkLength, delimiter); + foreach (byte[] fileData in VaryLineEndings(fileDataTemplate, randomSeed)) + { + // act + var actual = TokenizeHeaderedCsvFileUsingCursively(fileData, chunkLength, delimiter); + + // assert + var expected = TokenizeCsvFileUsingCsvHelper(fileData, $"{(char)delimiter}"); + Assert.Equal(expected, actual); + } + } + + [Fact] + public void HeaderedCsvTokenizationShouldRejectTooManyDataFieldsByDefault() + { + // arrange + byte[] fileData = File.ReadAllBytes(Path.Combine(TestCsvFilesFolderPath, "with-headers", "invalid", "too-many-data-fields.csv")); + + // act, assert + Assert.Throws(() => TokenizeHeaderedCsvFileUsingCursively(fileData, fileData.Length, (byte)',')); + } + + [Fact] + public void HeaderedCsvTokenizationShouldRejectMissingDataFieldsByDefault() + { + // arrange + byte[] fileData = File.ReadAllBytes(Path.Combine(TestCsvFilesFolderPath, "with-headers", "invalid", "missing-data-fields.csv")); + + // act, assert + Assert.Throws(() => TokenizeHeaderedCsvFileUsingCursively(fileData, fileData.Length, (byte)',')); + } + + [Fact] + public void HeaderedCsvTokenizationShouldRejectInvalidUTF8ByDefault() + { + // arrange + byte[] fileData = File.ReadAllBytes(Path.Combine(TestCsvFilesFolderPath, "with-headers", "invalid", "invalid-utf8-in-header.csv")); + + // act, assert + Assert.Throws(() => TokenizeHeaderedCsvFileUsingCursively(fileData, fileData.Length, (byte)',')); + } + private static List TokenizeCsvFileUsingCursively(ReadOnlySpan fileData, int chunkLength, byte delimiter) { var tokenizer = new CsvTokenizer(delimiter); @@ -134,6 +187,21 @@ private static List TokenizeCsvFileUsingCursively(ReadOnlySpan f return visitor.Records; } + private static List TokenizeHeaderedCsvFileUsingCursively(ReadOnlySpan fileData, int chunkLength, byte delimiter) + { + var tokenizer = new CsvTokenizer(delimiter); + var visitor = new HeaderedStringBufferingVisitor(fileData.Length); + while (fileData.Length >= chunkLength) + { + tokenizer.ProcessNextChunk(fileData.Slice(0, chunkLength), visitor); + fileData = fileData.Slice(chunkLength); + } + + tokenizer.ProcessNextChunk(fileData, visitor); + tokenizer.ProcessEndOfStream(visitor); + return visitor.Records; + } + private static IEnumerable TokenizeCsvFileUsingCsvHelper(byte[] csvData, string delimiter) { using (var stream = new MemoryStream(csvData, false)) @@ -197,6 +265,12 @@ private static byte[][] VaryLineEndings(ReadOnlySpan fileData, int randomS return Array.ConvertAll(resultLists, lst => lst.ToArray()); } + private static IEnumerable GetTestCsvFilesWithChunkLengthsAndDelimiters(params string[] pathParts) => + from filePath in Directory.EnumerateFiles(Path.Combine(TestCsvFilesFolderPath, 
Path.Combine(pathParts)), "*.csv", SearchOption.AllDirectories) + from chunkLength in TestChunkLengths + from delimiter in TestDelimiters + select new object[] { filePath, chunkLength, delimiter }; + private sealed class StringBufferingVisitor : CsvReaderVisitorBase { private static readonly UTF8Encoding TheEncoding = new UTF8Encoding(false, false); @@ -278,5 +352,51 @@ private void VisitFieldContents(ReadOnlySpan chunk, bool flush) } } } + + private sealed class HeaderedStringBufferingVisitor : CsvReaderVisitorWithUTF8HeadersBase + { + private static readonly UTF8Encoding TheEncoding = new UTF8Encoding(false, false); + + private readonly List _fields = new List(); + + private readonly byte[] _cutBuffer; + + private int _cutBufferConsumed; + + public HeaderedStringBufferingVisitor(int fileLength) => _cutBuffer = new byte[fileLength]; + + public List Records { get; } = new List(); + + protected override void VisitEndOfHeaderRecord() + { + Records.Insert(0, Headers.ToArray()); + } + + protected override void VisitEndOfDataRecord() + { + Records.Add(_fields.ToArray()); + _fields.Clear(); + } + + protected override void VisitPartialDataFieldContents(ReadOnlySpan chunk) => CopyToCutBuffer(chunk); + + protected override void VisitEndOfDataField(ReadOnlySpan chunk) + { + if (_cutBufferConsumed != 0) + { + CopyToCutBuffer(chunk); + chunk = new ReadOnlySpan(_cutBuffer, 0, _cutBufferConsumed); + } + + _fields.Add(TheEncoding.GetString(chunk)); + _cutBufferConsumed = 0; + } + + private void CopyToCutBuffer(ReadOnlySpan chunk) + { + chunk.CopyTo(new Span(_cutBuffer, _cutBufferConsumed, chunk.Length)); + _cutBufferConsumed += chunk.Length; + } + } } } diff --git a/test/Cursively.Tests/Cursively.Tests.csproj b/test/Cursively.Tests/Cursively.Tests.csproj index ddb7268..a969ff3 100644 --- a/test/Cursively.Tests/Cursively.Tests.csproj +++ b/test/Cursively.Tests/Cursively.Tests.csproj @@ -17,7 +17,7 @@ - + diff --git a/test/Cursively.Tests/TestCsvFiles/with-headers/invalid/invalid-utf8-in-header.csv b/test/Cursively.Tests/TestCsvFiles/with-headers/invalid/invalid-utf8-in-header.csv new file mode 100644 index 0000000..7ff9b39 --- /dev/null +++ b/test/Cursively.Tests/TestCsvFiles/with-headers/invalid/invalid-utf8-in-header.csv @@ -0,0 +1,2 @@ +é, +a, \ No newline at end of file diff --git a/test/Cursively.Tests/TestCsvFiles/with-headers/invalid/missing-data-fields.csv b/test/Cursively.Tests/TestCsvFiles/with-headers/invalid/missing-data-fields.csv new file mode 100644 index 0000000..7cc53b4 --- /dev/null +++ b/test/Cursively.Tests/TestCsvFiles/with-headers/invalid/missing-data-fields.csv @@ -0,0 +1,3 @@ +a,b,c +1,2,3 +1,2 \ No newline at end of file diff --git a/test/Cursively.Tests/TestCsvFiles/with-headers/invalid/too-many-data-fields.csv b/test/Cursively.Tests/TestCsvFiles/with-headers/invalid/too-many-data-fields.csv new file mode 100644 index 0000000..636ad5f --- /dev/null +++ b/test/Cursively.Tests/TestCsvFiles/with-headers/invalid/too-many-data-fields.csv @@ -0,0 +1,2 @@ +"a","b","c" +"a","b","c", \ No newline at end of file diff --git a/test/Cursively.Tests/TestCsvFiles/with-headers/valid/invalid-utf8-outside-header.csv b/test/Cursively.Tests/TestCsvFiles/with-headers/valid/invalid-utf8-outside-header.csv new file mode 100644 index 0000000..ee4f68d --- /dev/null +++ b/test/Cursively.Tests/TestCsvFiles/with-headers/valid/invalid-utf8-outside-header.csv @@ -0,0 +1,2 @@ +a, +é, \ No newline at end of file diff --git a/test/Cursively.Tests/TestCsvFiles/with-headers/valid/simple.csv 
b/test/Cursively.Tests/TestCsvFiles/with-headers/valid/simple.csv new file mode 100644 index 0000000..b4819c6 --- /dev/null +++ b/test/Cursively.Tests/TestCsvFiles/with-headers/valid/simple.csv @@ -0,0 +1,3 @@ +A,B,C +1,2,3 +do,re,mi \ No newline at end of file From e3f88858866f58500a648de4611748c12c56f635 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Sat, 15 Jun 2019 14:51:16 -0400 Subject: [PATCH 20/22] Rename to "ProcessFile". "Entire" was redundant. --- src/Cursively/Csv.cs | 2 +- test/Cursively.Tests/CsvTokenizerTests.cs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Cursively/Csv.cs b/src/Cursively/Csv.cs index 917992a..eca7491 100644 --- a/src/Cursively/Csv.cs +++ b/src/Cursively/Csv.cs @@ -275,7 +275,7 @@ public static async ValueTask ProcessStreamAsync(Stream csvStream, CsvReaderVisi /// /// See . /// - public static unsafe void ProcessEntireFile(string csvFilePath, CsvReaderVisitorBase visitor) + public static unsafe void ProcessFile(string csvFilePath, CsvReaderVisitorBase visitor) { using (var fl = new FileStream(csvFilePath, FileMode.Open, FileAccess.Read, FileShare.Read, 4096, FileOptions.SequentialScan)) { diff --git a/test/Cursively.Tests/CsvTokenizerTests.cs b/test/Cursively.Tests/CsvTokenizerTests.cs index 4095bd8..77f951e 100644 --- a/test/Cursively.Tests/CsvTokenizerTests.cs +++ b/test/Cursively.Tests/CsvTokenizerTests.cs @@ -85,7 +85,7 @@ public void MemoryMappedCsvShouldMatchCsvHelper(string filePath) var visitor = new StringBufferingVisitor(checked((int)new FileInfo(filePath).Length)); // act - Csv.ProcessEntireFile(filePath, visitor); + Csv.ProcessFile(filePath, visitor); var actual = visitor.Records; // assert @@ -101,7 +101,7 @@ public void NonstandardQuotedFieldsShouldNotify() var visitor = new NonstandardFieldVisitor(checked((int)new FileInfo(csvFilePath).Length)); // act - Csv.ProcessEntireFile(csvFilePath, visitor); + Csv.ProcessFile(csvFilePath, visitor); // assert string[] expectedContentsBeforeNonstandardFields = From 67d558d69965d9dc4bf3119ee2dd297fe8e5fe24 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Sat, 15 Jun 2019 14:51:36 -0400 Subject: [PATCH 21/22] Update README.md --- README.md | 101 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 66 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 7b8411d..ea255e9 100644 --- a/README.md +++ b/README.md @@ -9,43 +9,10 @@ A fast, [RFC 4180](https://tools.ietf.org/html/rfc4180)-conforming CSV reading l Documentation is currently being published as [GitHub Pages](https://airbreather.github.io/Cursively/index.html). ## Usage -1. Create a subclass of `CsvReaderVisitorBase` with your own logic. -1. To read a CSV file: - - Create a new instance of your visitor. - - Create a new instance of `CsvTokenizer`. - - Call `CsvTokenizer.ProcessNextChunk` for each chunk of the file. - - Call `CsvTokenizer.ProcessEndOfStream` after the last chunk of the file. - -## Example -This demonstrates using Cursively to write the details of a particular UTF-8 encoded file to the console. +Create a subclass of `CsvReaderVisitorBase` (or one of its own built-in subclasses) with your own logic for processing the individual elements in order. Then, you have some options. 
+
+### Example Visitor
 ```csharp
-public static void ProcessCsvFile(string csvFilePath)
-{
-    var myVisitor = new MyVisitor(maxFieldLength: 1000);
-    var tokenizer = new CsvTokenizer();
-    using (var file = File.OpenRead(csvFilePath))
-    {
-        Console.WriteLine($"Started reading '{csvFilePath}'.");
-        Span fileReadBuffer = new byte[4096];
-        while (true)
-        {
-            int count = file.Read(fileReadBuffer);
-            if (count == 0)
-            {
-                break;
-            }
-
-            var chunk = fileReadBuffer.Slice(0, count);
-            tokenizer.ProcessNextChunk(chunk, myVisitor);
-        }
-
-        tokenizer.ProcessEndOfStream(myVisitor);
-    }
-
-    Console.WriteLine($"Finished reading '{csvFilePath}'.");
-}
-
 public sealed class MyVisitor : CsvReaderVisitorBase
 {
     private readonly Decoder _utf8Decoder = Encoding.UTF8.GetDecoder();
@@ -88,3 +55,67 @@ public sealed class MyVisitor : CsvReaderVisitorBase
     }
 }
 ```
+
+### Fastest
+All of the other methods of processing the data are built on top of this, so it gives you the most control:
+1. Create a new instance of your visitor.
+1. Create a new instance of `CsvTokenizer`.
+1. Call `CsvTokenizer.ProcessNextChunk` for each chunk of the file.
+1. Call `CsvTokenizer.ProcessEndOfStream` after the last chunk of the file.
+
+Example:
+```csharp
+public static void ProcessCsvFile(string csvFilePath)
+{
+    var myVisitor = new MyVisitor(maxFieldLength: 1000);
+    var tokenizer = new CsvTokenizer();
+    using (var file = File.OpenRead(csvFilePath))
+    {
+        Console.WriteLine($"Started reading '{csvFilePath}'.");
+        Span fileReadBuffer = new byte[4096];
+        while (true)
+        {
+            int count = file.Read(fileReadBuffer);
+            if (count == 0)
+            {
+                break;
+            }
+
+            var chunk = fileReadBuffer.Slice(0, count);
+            tokenizer.ProcessNextChunk(chunk, myVisitor);
+        }
+
+        tokenizer.ProcessEndOfStream(myVisitor);
+    }
+
+    Console.WriteLine($"Finished reading '{csvFilePath}'.");
+}
+```
+
+### Simpler
+1. Create a new instance of your visitor.
+1. Call one of the `Csv.Process*` methods, passing in whatever format your data is in along with your visitor.
+
+Examples:
+```csharp
+public static void ProcessCsvFile(string csvFilePath)
+{
+    Console.WriteLine($"Started reading '{csvFilePath}'.");
+    Csv.ProcessFile(csvFilePath, new MyVisitor(maxFieldLength: 1000));
+    Console.WriteLine($"Finished reading '{csvFilePath}'.");
+}
+
+public static void ProcessCsvStream(Stream csvStream)
+{
+    Console.WriteLine("Started reading the CSV stream.");
+    Csv.ProcessStream(csvStream, new MyVisitor(maxFieldLength: 1000));
+    Console.WriteLine("Finished reading the CSV stream.");
+}
+
+public static async ValueTask ProcessCsvStreamAsync(Stream csvStream, IProgress progress = null, CancellationToken cancellationToken = default)
+{
+    Console.WriteLine("Started reading the CSV stream.");
+    await Csv.ProcessStreamAsync(csvStream, new MyVisitor(maxFieldLength: 1000), progress, cancellationToken);
+    Console.WriteLine("Finished reading the CSV stream.");
+}
+```

From 3de2d12e2f98c57d8696f3a2c4297ce221b96fe9 Mon Sep 17 00:00:00 2001
From: Joe Amenta
Date: Sat, 15 Jun 2019 15:27:11 -0400
Subject: [PATCH 22/22] doc updates

---
 doc/benchmark-1.1.0.md | 79 ++++++++++++++++++++++++++++++++++++++++++
 doc/release-notes.md   |  8 +++++
 doc/toc.yml            |  2 +-
 3 files changed, 88 insertions(+), 1 deletion(-)
 create mode 100644 doc/benchmark-1.1.0.md

diff --git a/doc/benchmark-1.1.0.md b/doc/benchmark-1.1.0.md
new file mode 100644
index 0000000..a588ec9
--- /dev/null
+++ b/doc/benchmark-1.1.0.md
@@ -0,0 +1,79 @@
+This benchmark tests the simple act of counting how many records are in a CSV file.
It's not a simple count of how many lines are in the text file: line breaks within quoted fields must be treated as data, and multiple line breaks in a row must be treated as one, since each record must have at least one field. Therefore, assuming correct implementations, this benchmark should test the raw CSV processing speed. + +Cursively eliminates a ton of overhead found in libraries such as CsvHelper by restricting the allowed input encodings and using the visitor pattern as its only means of output. Cursively can scan through the original bytes of the input to do its work, and it can give slices of the input data directly to the consumer without having to copy or allocate. + +Therefore, these benchmarks are somewhat biased in favor of Cursively, as CsvHelper relies on external code to transform the data to UTF-16. This isn't as unfair as that makes it sound: the overwhelming majority of input files are probably UTF-8 anyway (or a compatible SBCS), so this transformation is something that practically every user will experience. + +- Input files can be found here: https://github.com/airbreather/Cursively/tree/v1.1.0/test/Cursively.Benchmark/large-csv-files.zip +- Benchmark source code is this: https://github.com/airbreather/Cursively/tree/v1.1.0/test/Cursively.Benchmark + +Raw BenchmarkDotNet output is at the bottom, but here are some numbers derived from it. The data was fully loaded in main memory when running these tests. This summary also does not indicate anything about the GC pressure: + +|CSV File|Runtime|Library|Throughput| +|-|-|-|-| +|100 records / 10,000 tiny fields each|.NET 4.7.2|Cursively|336.06 MiB/s| +|100 records / 10,000 tiny fields each|.NET 4.7.2|CsvHelper|22.04 MiB/s| +|100 records / 10,000 tiny fields each|.NET Core 2.2.5|Cursively|487.59 MiB/s| +|100 records / 10,000 tiny fields each|.NET Core 2.2.5|CsvHelper|27.31 MiB/s| +|100 records / 10,000 tiny quoted fields each|.NET 4.7.2|Cursively|178.23 MiB/s| +|100 records / 10,000 tiny quoted fields each|.NET 4.7.2|CsvHelper|24.33 MiB/s| +|100 records / 10,000 tiny quoted fields each|.NET Core 2.2.5|Cursively|303.67 MiB/s| +|100 records / 10,000 tiny quoted fields each|.NET Core 2.2.5|CsvHelper|29.20 MiB/s| +|10,000 records / 1,000 empty fields each|.NET 4.7.2|Cursively|176.71 MiB/s| +|10,000 records / 1,000 empty fields each|.NET 4.7.2|CsvHelper|14.45 MiB/s| +|10,000 records / 1,000 empty fields each|.NET Core 2.2.5|Cursively|306.49 MiB/s| +|10,000 records / 1,000 empty fields each|.NET Core 2.2.5|CsvHelper|15.15 MiB/s| +|Mock data from Mockaroo|.NET 4.7.2|Cursively|2,711.41 MiB/s| +|Mock data from Mockaroo|.NET 4.7.2|CsvHelper|72.50 MiB/s| +|Mock data from Mockaroo|.NET Core 2.2.5|Cursively|3,755.55 MiB/s| +|Mock data from Mockaroo|.NET Core 2.2.5|CsvHelper|75.05 MiB/s| +|worldcitiespop.csv ([from here](https://burntsushi.net/stuff/))|.NET 4.7.2|Cursively|390.75 MiB/s| +|worldcitiespop.csv ([from here](https://burntsushi.net/stuff/))|.NET 4.7.2|CsvHelper|40.15 MiB/s| +|worldcitiespop.csv ([from here](https://burntsushi.net/stuff/))|.NET Core 2.2.5|Cursively|607.81 MiB/s| +|worldcitiespop.csv ([from here](https://burntsushi.net/stuff/))|.NET Core 2.2.5|CsvHelper|39.90 MiB/s| + +Raw BenchmarkDotNet output: + +``` ini + +BenchmarkDotNet=v0.11.5, OS=Windows 10.0.18362 +Intel Core i7-6850K CPU 3.60GHz (Skylake), 1 CPU, 12 logical and 6 physical cores +.NET Core SDK=3.0.100-preview6-012264 + [Host] : .NET Core 2.2.5 (CoreCLR 4.6.27617.05, CoreFX 4.6.27618.01), 64bit RyuJIT + Job-DDQSKN : .NET Framework 4.7.2 (CLR 
4.0.30319.42000), 64bit RyuJIT-v4.8.3801.0 + Job-RTHUVO : .NET Core 2.2.5 (CoreCLR 4.6.27617.05, CoreFX 4.6.27618.01), 64bit RyuJIT + +Server=True + +``` +| Method | Runtime | csvFile | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated | +|------------------------ |-------- |--------------------- |-------------:|----------:|----------:|------:|--------:|------------:|-----------:|---------:|-------------:| +| CountRowsUsingCursively | Clr | 100-huge-records | 8.231 ms | 0.0839 ms | 0.0743 ms | 1.00 | 0.00 | - | - | - | 128 B | +| CountRowsUsingCsvHelper | Clr | 100-huge-records | 125.493 ms | 1.1717 ms | 1.0387 ms | 15.25 | 0.21 | 17250.0000 | 6750.0000 | 750.0000 | 110560856 B | +| | | | | | | | | | | | | +| CountRowsUsingCursively | Core | 100-huge-records | 5.673 ms | 0.0073 ms | 0.0068 ms | 1.00 | 0.00 | - | - | - | 48 B | +| CountRowsUsingCsvHelper | Core | 100-huge-records | 101.277 ms | 0.2342 ms | 0.2190 ms | 17.85 | 0.05 | 400.0000 | 200.0000 | - | 110256320 B | +| | | | | | | | | | | | | +| CountRowsUsingCursively | Clr | 100-h(...)uoted [23] | 26.222 ms | 0.0260 ms | 0.0231 ms | 1.00 | 0.00 | - | - | - | 256 B | +| CountRowsUsingCsvHelper | Clr | 100-h(...)uoted [23] | 192.090 ms | 0.9954 ms | 0.9311 ms | 7.33 | 0.04 | 25000.0000 | 11000.0000 | 666.6667 | 154027456 B | +| | | | | | | | | | | | | +| CountRowsUsingCursively | Core | 100-h(...)uoted [23] | 15.390 ms | 0.0450 ms | 0.0399 ms | 1.00 | 0.00 | - | - | - | 48 B | +| CountRowsUsingCsvHelper | Core | 100-h(...)uoted [23] | 160.043 ms | 0.4644 ms | 0.4344 ms | 10.40 | 0.04 | 333.3333 | - | - | 153579848 B | +| | | | | | | | | | | | | +| CountRowsUsingCursively | Clr | 10k-empty-records | 54.007 ms | 0.3061 ms | 0.2556 ms | 1.00 | 0.00 | - | - | - | 819 B | +| CountRowsUsingCsvHelper | Clr | 10k-empty-records | 661.502 ms | 3.1801 ms | 2.9747 ms | 12.24 | 0.08 | 66000.0000 | 2000.0000 | - | 422077104 B | +| | | | | | | | | | | | | +| CountRowsUsingCursively | Core | 10k-empty-records | 31.178 ms | 0.2056 ms | 0.1924 ms | 1.00 | 0.00 | - | - | - | 48 B | +| CountRowsUsingCsvHelper | Core | 10k-empty-records | 630.683 ms | 1.2503 ms | 1.1084 ms | 20.23 | 0.13 | 2000.0000 | - | - | 420832856 B | +| | | | | | | | | | | | | +| CountRowsUsingCursively | Clr | mocked | 4.478 ms | 0.0071 ms | 0.0067 ms | 1.00 | 0.00 | - | - | - | 64 B | +| CountRowsUsingCsvHelper | Clr | mocked | 167.477 ms | 0.3523 ms | 0.3296 ms | 37.40 | 0.08 | 18333.3333 | 333.3333 | - | 116105312 B | +| | | | | | | | | | | | | +| CountRowsUsingCursively | Core | mocked | 3.233 ms | 0.0063 ms | 0.0059 ms | 1.00 | 0.00 | - | - | - | 48 B | +| CountRowsUsingCsvHelper | Core | mocked | 161.791 ms | 0.3473 ms | 0.3249 ms | 50.05 | 0.15 | 333.3333 | - | - | 115757736 B | +| | | | | | | | | | | | | +| CountRowsUsingCursively | Clr | worldcitiespop | 369.738 ms | 0.6855 ms | 0.6077 ms | 1.00 | 0.00 | - | - | - | 8192 B | +| CountRowsUsingCsvHelper | Clr | worldcitiespop | 3,598.421 ms | 2.0735 ms | 1.9396 ms | 9.73 | 0.02 | 493000.0000 | 7000.0000 | - | 3105811440 B | +| | | | | | | | | | | | | +| CountRowsUsingCursively | Core | worldcitiespop | 237.695 ms | 0.2994 ms | 0.2800 ms | 1.00 | 0.00 | - | - | - | 48 B | +| CountRowsUsingCsvHelper | Core | worldcitiespop | 3,620.550 ms | 3.1766 ms | 2.8160 ms | 15.23 | 0.02 | 15000.0000 | - | - | 3096694312 B | diff --git a/doc/release-notes.md b/doc/release-notes.md index 8c1dc2d..1096e59 100644 --- a/doc/release-notes.md +++ b/doc/release-notes.md @@ -1,4 +1,12 @@ # Cursively Release Notes +## 
[1.1.0](https://github.com/airbreather/Cursively/milestone/1) +- Several further performance optimizations. Most significantly, inlining and tuning a critical `ReadOnlySpan` extension method. + - In some cases, this increased throughput by a factor of 3. +- Added hooks for visitor implementations to detect situations where the stream does not conform to the RFC 4180 rules for quoted fields ([#4](https://github.com/airbreather/Cursively/issues/4)) +- Added support to customize the field delimiter byte ([#11](https://github.com/airbreather/Cursively/issues/11)) +- Added helpers to avoid having to use `CsvTokenizer` directly in most cases ([#9](https://github.com/airbreather/Cursively/issues/9), [#10](https://github.com/airbreather/Cursively/issues/10)) +- Added an intermediate abstract visitor class that handles UTF-8 encoded headers ([#5](https://github.com/airbreather/Cursively/issues/5)) + ## 1.0.0 - Initial release. diff --git a/doc/toc.yml b/doc/toc.yml index 0e11d66..c94d485 100644 --- a/doc/toc.yml +++ b/doc/toc.yml @@ -3,7 +3,7 @@ - name: API Documentation href: obj/api/ - name: Benchmark - href: benchmark-1.0.0.md + href: benchmark-1.1.0.md - name: Release Notes href: release-notes.md - name: NuGet Package
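As a closing illustration of the 1.1.0 surface described in the release notes above, here is a minimal consumer that combines the `Csv.ProcessFile` helper with the headered visitor base class. The field handling is deliberately simplistic and is not part of the library:

```csharp
using System;
using System.Collections.Generic;
using System.Text;
using Cursively;

// Minimal sketch: print each data record as "header=value" pairs using the new
// CsvReaderVisitorWithUTF8HeadersBase. Note: for brevity this decodes each chunk
// independently; production code should reuse a System.Text.Decoder so that
// multi-byte UTF-8 sequences split across chunks decode correctly, as in the
// README example.
public sealed class PrintingVisitor : CsvReaderVisitorWithUTF8HeadersBase
{
    private readonly List<string> _fields = new List<string>();

    private readonly StringBuilder _currentField = new StringBuilder();

    protected override void VisitPartialDataFieldContents(ReadOnlySpan<byte> chunk) =>
        _currentField.Append(Encoding.UTF8.GetString(chunk.ToArray()));

    protected override void VisitEndOfDataField(ReadOnlySpan<byte> chunk)
    {
        _currentField.Append(Encoding.UTF8.GetString(chunk.ToArray()));
        _fields.Add(_currentField.ToString());
        _currentField.Clear();
    }

    protected override void VisitEndOfDataRecord()
    {
        // With the default settings, records with missing or extra fields throw
        // before reaching here, so _fields.Count always matches Headers.Length.
        for (int i = 0; i < _fields.Count; i++)
        {
            Console.WriteLine($"{Headers[i]}={_fields[i]}");
        }

        _fields.Clear();
    }
}

// Usage:
//     Csv.ProcessFile("some-data.csv", new PrintingVisitor());
```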