From 6948d07405f54d21456a50d5ca7174e68e036f42 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Sun, 26 May 2019 12:46:00 -0400 Subject: [PATCH 01/22] Fix an off-by-one in the readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ba574db..b1382b8 100644 --- a/README.md +++ b/README.md @@ -69,7 +69,7 @@ public sealed class MyVisitor : CsvReaderVisitorBase private void VisitFieldContents(ReadOnlySpan chunk, bool flush) { int charCount = _utf8Decoder.GetCharCount(chunk, flush); - if (charCount + _bufferConsumed < _buffer.Length) + if (charCount + _bufferConsumed <= _buffer.Length) { _utf8Decoder.GetChars(chunk, new Span(_buffer, _bufferConsumed, charCount), flush); _bufferConsumed += charCount; From 18a757a7c737363abed94246c8016ee5bede4b7e Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Sun, 26 May 2019 13:56:36 -0400 Subject: [PATCH 02/22] Add memory-mapped file helper. Resolves #9 --- src/Cursively/Csv.cs | 64 +++++++++++++++++++++++ src/Cursively/Cursively.csproj | 1 + test/Cursively.Benchmark/Program.cs | 22 ++++---- test/Cursively.Tests/CsvTokenizerTests.cs | 24 ++++++++- 4 files changed, 96 insertions(+), 15 deletions(-) create mode 100644 src/Cursively/Csv.cs diff --git a/src/Cursively/Csv.cs b/src/Cursively/Csv.cs new file mode 100644 index 0000000..e23740b --- /dev/null +++ b/src/Cursively/Csv.cs @@ -0,0 +1,64 @@ +using System; +using System.IO; +using System.IO.MemoryMappedFiles; + +namespace Cursively +{ + /// + /// Contains helper methods for CSV processing. + /// + public static class Csv + { + /// + /// Describes the contents of a CSV file to the given instance of the + /// class, using memory-mapped files behind the scenes. + /// + /// + /// The path to the CSV file to describe. + /// + /// + /// The instance to describe the file to. + /// + public static unsafe void ProcessMemoryMappedFile(string csvFilePath, CsvReaderVisitorBase visitor) + { + using (var fl = new FileStream(csvFilePath, FileMode.Open, FileAccess.Read, FileShare.Read)) + { + long length = fl.Length; + if (length == 0) + { + return; + } + + var tokenizer = new CsvTokenizer(); + using (var memoryMappedFile = MemoryMappedFile.CreateFromFile(fl, null, 0, MemoryMappedFileAccess.Read, HandleInheritability.None, leaveOpen: true)) + using (var accessor = memoryMappedFile.CreateViewAccessor(0, 0, MemoryMappedFileAccess.Read)) + { + var handle = accessor.SafeMemoryMappedViewHandle; + byte* ptr = null; + try + { + handle.AcquirePointer(ref ptr); + for (long rem = length; rem > 0; rem -= int.MaxValue) + { + int currentChunkLength = rem < int.MaxValue + ? 
unchecked((int)rem) + : int.MaxValue; + + var span = new ReadOnlySpan(ptr, currentChunkLength); + tokenizer.ProcessNextChunk(span, visitor); + } + + tokenizer.ProcessEndOfStream(visitor); + } + finally + { + if (ptr != null) + { + handle.ReleasePointer(); + } + } + } + } + } + } +} diff --git a/src/Cursively/Cursively.csproj b/src/Cursively/Cursively.csproj index a702484..6f5c7ae 100644 --- a/src/Cursively/Cursively.csproj +++ b/src/Cursively/Cursively.csproj @@ -2,6 +2,7 @@ netstandard2.0 + true diff --git a/test/Cursively.Benchmark/Program.cs b/test/Cursively.Benchmark/Program.cs index 38d3d9c..30a987e 100644 --- a/test/Cursively.Benchmark/Program.cs +++ b/test/Cursively.Benchmark/Program.cs @@ -11,32 +11,28 @@ namespace Cursively.Benchmark { - [ClrJob] - [CoreJob] - [CoreRtJob] - [GcServer(true)] - [MemoryDiagnoser] + [ClrJob, CoreJob, GcServer(true), MemoryDiagnoser] public class Program { public static CsvFile[] CsvFiles => GetCsvFiles(); [Benchmark(Baseline = true)] [ArgumentsSource(nameof(CsvFiles))] - public void NopUsingCursively(CsvFile csvFile) + public long CountRowsUsingCursivelyByteArray(CsvFile csvFile) { + var visitor = new RowCountingVisitor(); var tokenizer = new CsvTokenizer(); - tokenizer.ProcessNextChunk(csvFile.FileData, null); - tokenizer.ProcessEndOfStream(null); + tokenizer.ProcessNextChunk(csvFile.FileData, visitor); + tokenizer.ProcessEndOfStream(visitor); + return visitor.RowCount; } [Benchmark] [ArgumentsSource(nameof(CsvFiles))] - public long CountRowsUsingCursively(CsvFile csvFile) + public long CountRowsUsingCursivelyWithMemoryMappedFile(CsvFile csvFile) { var visitor = new RowCountingVisitor(); - var tokenizer = new CsvTokenizer(); - tokenizer.ProcessNextChunk(csvFile.FileData, visitor); - tokenizer.ProcessEndOfStream(visitor); + Csv.ProcessMemoryMappedFile(csvFile.FullPath, visitor); return visitor.RowCount; } @@ -63,7 +59,7 @@ private static int Main() var prog = new Program(); foreach (var csvFile in CsvFiles) { - if (prog.CountRowsUsingCursively(csvFile) != prog.CountRowsUsingCsvHelper(csvFile)) + if (prog.CountRowsUsingCursivelyByteArray(csvFile) != prog.CountRowsUsingCsvHelper(csvFile)) { Console.Error.WriteLine($"Failed on {csvFile}."); return 1; diff --git a/test/Cursively.Tests/CsvTokenizerTests.cs b/test/Cursively.Tests/CsvTokenizerTests.cs index 0c18b40..d1d2849 100644 --- a/test/Cursively.Tests/CsvTokenizerTests.cs +++ b/test/Cursively.Tests/CsvTokenizerTests.cs @@ -18,13 +18,17 @@ public sealed class CsvTokenizerTests private static readonly int[] TestChunkLengths = { 1, 2, 3, 5, 8, 13, 21, 34 }; public static IEnumerable TestCsvFiles => + from filePath in Directory.EnumerateFiles(TestCsvFilesFolderPath, "*.csv") + select new object[] { filePath }; + + public static IEnumerable TestCsvFilesWithChunkLengths => from filePath in Directory.EnumerateFiles(TestCsvFilesFolderPath, "*.csv") let fileName = Path.GetFileNameWithoutExtension(filePath) from chunkLength in TestChunkLengths select new object[] { fileName, chunkLength }; [Theory] - [MemberData(nameof(TestCsvFiles))] + [MemberData(nameof(TestCsvFilesWithChunkLengths))] public void NullVisitorShouldBeFine(string fileName, int chunkLength) { // arrange @@ -46,7 +50,7 @@ public void NullVisitorShouldBeFine(string fileName, int chunkLength) } [Theory] - [MemberData(nameof(TestCsvFiles))] + [MemberData(nameof(TestCsvFilesWithChunkLengths))] public void CsvTokenizationShouldMatchCsvHelper(string fileName, int chunkLength) { // arrange @@ -64,6 +68,22 @@ public void 
CsvTokenizationShouldMatchCsvHelper(string fileName, int chunkLength } } + [Theory] + [MemberData(nameof(TestCsvFiles))] + public void MemoryMappedCsvShouldMatchCsvHelper(string filePath) + { + // arrange + var visitor = new StringBufferingVisitor(checked((int)new FileInfo(filePath).Length)); + + // act + Csv.ProcessMemoryMappedFile(filePath, visitor); + var actual = visitor.Lines; + + // assert + var expected = TokenizeCsvFileUsingCsvHelper(File.ReadAllBytes(filePath)); + Assert.Equal(expected, actual); + } + private static List TokenizeCsvFileUsingMine(ReadOnlySpan fileData, int chunkLength) { var tokenizer = new CsvTokenizer(); From a20b068d9c27457749170777e1a180832bf8ba61 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Mon, 27 May 2019 10:14:50 -0400 Subject: [PATCH 03/22] doc updates --- doc/benchmark-1.0.0.md | 71 ++++++++++++++++++++++++++++++++++++++++++ doc/toc.yml | 2 ++ 2 files changed, 73 insertions(+) create mode 100644 doc/benchmark-1.0.0.md diff --git a/doc/benchmark-1.0.0.md b/doc/benchmark-1.0.0.md new file mode 100644 index 0000000..6ba25b1 --- /dev/null +++ b/doc/benchmark-1.0.0.md @@ -0,0 +1,71 @@ +This benchmark tests the simple act of counting how many records are in a CSV file. It's not a simple count of how many lines are in the text file: line breaks within quoted fields must be treated as data, and multiple line breaks in a row must be treated as one, since each record must have at least one field. Therefore, assuming correct implementations, this benchmark should test the raw CSV processing speed. + +Cursively eliminates a ton of overhead found in libraries such as CsvHelper by restricting the allowed input encodings and using the visitor pattern as its only means of output. Cursively can scan through the original bytes of the input to do its work, and it can give slices of the input data directly to the consumer without having to copy or allocate. + +Therefore, these benchmarks are somewhat biased in favor of Cursively, as CsvHelper relies on external code to transform the data to UTF-16. This isn't as unfair as that makes it sound: the overwhelming majority of input files are probably UTF-8 anyway (or a compatible SBCS), so this transformation is something that practically every user will experience. + +- Input files can be found here: https://github.com/airbreather/Cursively/tree/v1.0.0/test/Cursively.Benchmark/large-csv-files +- Benchmark source code is a slightly edited* version of this: https://github.com/airbreather/Cursively/tree/v1.0.0/test/Cursively.Benchmark + - *edited only to remove `CoreRtJob` and the more-or-less redundant `NopUsingCursively` + +Raw BenchmarkDotNet output is at the bottom, but here are some numbers derived from it. The data was fully loaded in main memory when running these tests. 
This summary also does not indicate anything about the GC pressure: + +|CSV File|Runtime|Library|Throughput| +|-|-|-|-| +|100 records / 10,000 tiny fields each|.NET 4.7.2|Cursively|99.81 MiB/s| +|100 records / 10,000 tiny fields each|.NET 4.7.2|CsvHelper|22.60 MiB/s| +|100 records / 10,000 tiny fields each|.NET Core 2.2.5|Cursively|126.1 MiB/s| +|100 records / 10,000 tiny fields each|.NET Core 2.2.5|CsvHelper|25.32 MiB/s| +|100 records / 10,000 tiny quoted fields each|.NET 4.7.2|Cursively|118.5 MiB/s| +|100 records / 10,000 tiny quoted fields each|.NET 4.7.2|CsvHelper|25.05 MiB/s| +|100 records / 10,000 tiny quoted fields each|.NET Core 2.2.5|Cursively|187.0 MiB/s| +|100 records / 10,000 tiny quoted fields each|.NET Core 2.2.5|CsvHelper|27.96 MiB/s| +|10,000 records / 1,000 empty fields each|.NET 4.7.2|Cursively|64.15 MiB/s| +|10,000 records / 1,000 empty fields each|.NET 4.7.2|CsvHelper|15.57 MiB/s| +|10,000 records / 1,000 empty fields each|.NET Core 2.2.5|Cursively|112.7 MiB/s| +|10,000 records / 1,000 empty fields each|.NET Core 2.2.5|CsvHelper|14.84 MiB/s| +|Mock data from Mockaroo|.NET 4.7.2|Cursively|1.637 GiB/s| +|Mock data from Mockaroo|.NET 4.7.2|CsvHelper|74.81 MiB/s| +|Mock data from Mockaroo|.NET Core 2.2.5|Cursively|1.893 GiB/s| +|Mock data from Mockaroo|.NET Core 2.2.5|CsvHelper|66.86 MiB/s| + +Raw BenchmarkDotNet output: + +``` ini + +BenchmarkDotNet=v0.11.5, OS=Windows 10.0.17134.765 (1803/April2018Update/Redstone4) +Intel Core i7-6850K CPU 3.60GHz (Skylake), 1 CPU, 12 logical and 6 physical cores +Frequency=3515622 Hz, Resolution=284.4447 ns, Timer=TSC +.NET Core SDK=2.2.300 + [Host] : .NET Core 2.2.5 (CoreCLR 4.6.27617.05, CoreFX 4.6.27618.01), 64bit RyuJIT + Job-ASLTDW : .NET Framework 4.7.2 (CLR 4.0.30319.42000), 64bit RyuJIT-v4.7.3416.0 + Job-RICADF : .NET Core 2.2.5 (CoreCLR 4.6.27617.05, CoreFX 4.6.27618.01), 64bit RyuJIT + +Server=True + +``` +| Method | Runtime | csvFile | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated | +|------------------------ |-------- |--------------------- |-----------:|----------:|----------:|------:|--------:|-----------:|----------:|---------:|------------:| +| CountRowsUsingCursively | Clr | 100-huge-records | 27.714 ms | 0.0126 ms | 0.0105 ms | 1.00 | 0.00 | - | - | - | 256 B | +| CountRowsUsingCsvHelper | Clr | 100-huge-records | 122.397 ms | 0.1685 ms | 0.1494 ms | 4.42 | 0.01 | 17250.0000 | 6250.0000 | 750.0000 | 110257334 B | +| | | | | | | | | | | | | +| CountRowsUsingCursively | Core | 100-huge-records | 21.932 ms | 0.0254 ms | 0.0226 ms | 1.00 | 0.00 | - | - | - | 56 B | +| CountRowsUsingCsvHelper | Core | 100-huge-records | 109.261 ms | 0.3319 ms | 0.3104 ms | 4.98 | 0.02 | 400.0000 | 200.0000 | - | 110256320 B | +| | | | | | | | | | | | | +| CountRowsUsingCursively | Clr | 100-h(...)uoted [23] | 39.453 ms | 0.0974 ms | 0.0864 ms | 1.00 | 0.00 | - | - | - | 683 B | +| CountRowsUsingCsvHelper | Clr | 100-h(...)uoted [23] | 186.572 ms | 0.4682 ms | 0.4380 ms | 4.73 | 0.01 | 24666.6667 | 9666.6667 | 666.6667 | 153595995 B | +| | | | | | | | | | | | | +| CountRowsUsingCursively | Core | 100-h(...)uoted [23] | 24.995 ms | 0.0160 ms | 0.0142 ms | 1.00 | 0.00 | - | - | - | 56 B | +| CountRowsUsingCsvHelper | Core | 100-h(...)uoted [23] | 167.160 ms | 0.3437 ms | 0.3215 ms | 6.69 | 0.02 | 333.3333 | - | - | 153579848 B | +| | | | | | | | | | | | | +| CountRowsUsingCursively | Clr | 10k-empty-records | 148.952 ms | 0.2502 ms | 0.2340 ms | 1.00 | 0.00 | - | - | - | 2048 B | +| CountRowsUsingCsvHelper | Clr | 
10k-empty-records | 613.718 ms | 0.8869 ms | 0.7862 ms | 4.12 | 0.01 | 66000.0000 | 2000.0000 | - | 420838944 B | +| | | | | | | | | | | | | +| CountRowsUsingCursively | Core | 10k-empty-records | 84.801 ms | 0.1079 ms | 0.1009 ms | 1.00 | 0.00 | - | - | - | 56 B | +| CountRowsUsingCsvHelper | Core | 10k-empty-records | 644.051 ms | 2.8782 ms | 2.5515 ms | 7.60 | 0.03 | 2000.0000 | - | - | 420832856 B | +| | | | | | | | | | | | | +| CountRowsUsingCursively | Clr | mocked | 7.242 ms | 0.0233 ms | 0.0207 ms | 1.00 | 0.00 | - | - | - | 64 B | +| CountRowsUsingCsvHelper | Clr | mocked | 162.298 ms | 0.2958 ms | 0.2622 ms | 22.41 | 0.08 | 18000.0000 | 333.3333 | - | 115764389 B | +| | | | | | | | | | | | | +| CountRowsUsingCursively | Core | mocked | 6.264 ms | 0.0115 ms | 0.0107 ms | 1.00 | 0.00 | - | - | - | 56 B | +| CountRowsUsingCsvHelper | Core | mocked | 181.592 ms | 0.3413 ms | 0.3193 ms | 28.99 | 0.09 | 333.3333 | - | - | 115757736 B | diff --git a/doc/toc.yml b/doc/toc.yml index aa4ba7c..0e11d66 100644 --- a/doc/toc.yml +++ b/doc/toc.yml @@ -2,6 +2,8 @@ href: index.md - name: API Documentation href: obj/api/ +- name: Benchmark + href: benchmark-1.0.0.md - name: Release Notes href: release-notes.md - name: NuGet Package From 69d65d83aef64e666faccf734f2774735bbd016e Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Tue, 28 May 2019 09:44:01 -0400 Subject: [PATCH 04/22] Support custom delimiters. Resolves #11 --- src/Cursively/CsvTokenizer.cs | 215 +++++++++++++--------- test/Cursively.Benchmark/Program.cs | 4 +- test/Cursively.Tests/CsvTokenizerTests.cs | 56 +++--- 3 files changed, 159 insertions(+), 116 deletions(-) diff --git a/src/Cursively/CsvTokenizer.cs b/src/Cursively/CsvTokenizer.cs index 8dd039e..1a44af0 100644 --- a/src/Cursively/CsvTokenizer.cs +++ b/src/Cursively/CsvTokenizer.cs @@ -142,20 +142,50 @@ namespace Cursively /// public class CsvTokenizer { - private const byte COMMA = (byte)','; - private const byte CR = (byte)'\r'; private const byte LF = (byte)'\n'; private const byte QUOTE = (byte)'"'; - private static readonly byte[] AllStopBytes = { COMMA, QUOTE, CR, LF }; - - private static readonly byte[] AllStopBytesExceptQuote = { COMMA, CR, LF }; + private readonly byte _delimiter; private ParserFlags _parserFlags; + /// + /// Initializes a new instance of the class. + /// + public CsvTokenizer() + : this((byte)',') + { + } + + /// + /// Initializes a new instance of the class. + /// + /// + /// The single byte to expect to see between fields of the same record. This may not be an + /// end-of-line or double-quote character, as those have special meanings. + /// + /// + /// Thrown when is 0x0A, 0x0D, or + /// 0x22. + /// + public CsvTokenizer(byte delimiter) + { + switch (delimiter) + { + case CR: + case LF: + case QUOTE: + throw new ArgumentException("Must not be a carriage return, linefeed, or double-quote.", nameof(delimiter)); + + default: + _delimiter = delimiter; + break; + } + } + [Flags] private enum ParserFlags : byte { @@ -189,8 +219,7 @@ public void ProcessNextChunk(ReadOnlySpan chunk, CsvReaderVisitorBase visi visitor = CsvReaderVisitorBase.Null; } - // cache the implicit conversion for the sake of "portable span" targets. - ReadOnlySpan allStopBytes = AllStopBytes; + byte delimiter = _delimiter; // we're going to consume the entire buffer that was handed to us. 
while (!chunk.IsEmpty) @@ -204,17 +233,11 @@ public void ProcessNextChunk(ReadOnlySpan chunk, CsvReaderVisitorBase visi continue; } - int idx = chunk.IndexOfAny(allStopBytes); - if (idx < 0) - { - visitor.VisitPartialFieldContents(chunk); - _parserFlags = ParserFlags.ReadAnythingInCurrentField | ParserFlags.ReadAnythingOnCurrentLine; - break; - } - - switch (chunk[idx]) + for (int idx = 0; idx < chunk.Length; idx++) { - case QUOTE: + byte c = chunk[idx]; + if (c == QUOTE) + { if (idx == 0) { _parserFlags = ParserFlags.CurrentFieldStartedWithQuote | ParserFlags.ReadAnythingInCurrentField | ParserFlags.ReadAnythingOnCurrentLine; @@ -227,20 +250,30 @@ public void ProcessNextChunk(ReadOnlySpan chunk, CsvReaderVisitorBase visi visitor.VisitPartialFieldContents(chunk.Slice(0, idx + 1)); _parserFlags = ParserFlags.ReadAnythingInCurrentField | ParserFlags.ReadAnythingOnCurrentLine; } - - break; - - case COMMA: + } + else if (c == delimiter) + { visitor.VisitEndOfField(chunk.Slice(0, idx)); _parserFlags = ParserFlags.ReadAnythingOnCurrentLine; - break; - - default: + } + else if (c == CR || c == LF) + { ProcessEndOfLine(chunk.Slice(0, idx), visitor); - break; + } + else + { + continue; + } + + chunk = chunk.Slice(idx + 1); + goto nextLoop; } - chunk = chunk.Slice(idx + 1); + visitor.VisitPartialFieldContents(chunk); + _parserFlags = ParserFlags.ReadAnythingInCurrentField | ParserFlags.ReadAnythingOnCurrentLine; + break; + + nextLoop:; } } @@ -303,69 +336,69 @@ private void PickUpFromLastTime(ref ReadOnlySpan readBuffer, CsvReaderVisi } // we have at least one more byte, so let's see what the double quote actually means - switch (readBuffer[idx + 1]) + byte b = readBuffer[idx + 1]; + if (b == QUOTE) { - case QUOTE: - // the double quote we stopped at was escaping a literal double quote, so we - // send everything up to and including the escaping quote. - visitor.VisitPartialFieldContents(readBuffer.Slice(0, idx + 1)); - break; - - case COMMA: - // the double quote was the end of a quoted field, so send the entire data - // from the beginning of this quoted field data chunk up to the double quote - // that terminated it (excluding, of course, the double quote itself). - visitor.VisitEndOfField(readBuffer.Slice(0, idx)); - _parserFlags = ParserFlags.ReadAnythingOnCurrentLine; - break; - - case CR: - case LF: - // same thing as the COMMA case, just the field ended at the end of a line - // instead of the end of a field on the current line. - ProcessEndOfLine(readBuffer.Slice(0, idx), visitor); - break; - - default: - // the double quote was the end of the quoted part of the field data, but - // then it continues on with more data; don't spend too much time optimizing - // this case since it's not RFC 4180, just do the parts we need to do in - // order to behave the way we said we would. - _parserFlags |= ParserFlags.QuotedFieldDataEnded; - visitor.VisitPartialFieldContents(readBuffer.Slice(0, idx)); - visitor.VisitPartialFieldContents(readBuffer.Slice(idx + 1, 1)); - break; + // the double quote we stopped at was escaping a literal double quote, so we + // send everything up to and including the escaping quote. + visitor.VisitPartialFieldContents(readBuffer.Slice(0, idx + 1)); + } + else if (b == _delimiter) + { + // the double quote was the end of a quoted field, so send the entire data from + // the beginning of this quoted field data chunk up to the double quote that + // terminated it (excluding, of course, the double quote itself). 
+ visitor.VisitEndOfField(readBuffer.Slice(0, idx)); + _parserFlags = ParserFlags.ReadAnythingOnCurrentLine; + } + else if (b == CR || b == LF) + { + // same thing as the delimiter case, just the field ended at the end of a line + // instead of the end of a field on the current line. + ProcessEndOfLine(readBuffer.Slice(0, idx), visitor); + } + else + { + // the double quote was the end of the quoted part of the field data, but then + // it continues on with more data; don't spend too much time optimizing this + // case since it's not RFC 4180, just do the parts we need to do in order to + // behave the way we said we would. + _parserFlags |= ParserFlags.QuotedFieldDataEnded; + visitor.VisitPartialFieldContents(readBuffer.Slice(0, idx)); + visitor.VisitPartialFieldContents(readBuffer.Slice(idx + 1, 1)); } // slice off the data up to the quote and the next byte that we read. readBuffer = readBuffer.Slice(idx + 2); - return; } - - // this is expected to be rare: either we were cut between field reads, or we're reading - // nonstandard field data where there's a quote that neither starts nor ends the field. + else { - int idx = readBuffer.IndexOfAny(AllStopBytesExceptQuote); - if (idx < 0) + // this is expected to be rare: either we were cut between field reads, or we're + // reading nonstandard field data where there's a quote that neither starts nor ends + // the field. + for (int idx = 0; idx < readBuffer.Length; idx++) { - visitor.VisitPartialFieldContents(readBuffer); - readBuffer = default; - return; - } - - switch (readBuffer[idx]) - { - case COMMA: + byte b = readBuffer[idx]; + if (b == _delimiter) + { visitor.VisitEndOfField(readBuffer.Slice(0, idx)); _parserFlags = ParserFlags.ReadAnythingOnCurrentLine; - break; - - default: + } + else if (b == CR || b == LF) + { ProcessEndOfLine(readBuffer.Slice(0, idx), visitor); - break; + } + else + { + continue; + } + + readBuffer = readBuffer.Slice(idx + 1); + return; } - readBuffer = readBuffer.Slice(idx + 1); + visitor.VisitPartialFieldContents(readBuffer); + readBuffer = default; } } @@ -379,24 +412,22 @@ private void HandleBufferCutAtPotentiallyTerminalDoubleQuote(ref ReadOnlySpan TestCsvFiles => from filePath in Directory.EnumerateFiles(TestCsvFilesFolderPath, "*.csv") select new object[] { filePath }; - public static IEnumerable TestCsvFilesWithChunkLengths => + public static IEnumerable TestCsvFilesWithChunkLengthsAndDelimiters => from filePath in Directory.EnumerateFiles(TestCsvFilesFolderPath, "*.csv") let fileName = Path.GetFileNameWithoutExtension(filePath) from chunkLength in TestChunkLengths - select new object[] { fileName, chunkLength }; + from delimiter in TestDelimiters + select new object[] { fileName, chunkLength, delimiter }; + + [Theory] + [InlineData((byte)0x0A)] + [InlineData((byte)0x0D)] + [InlineData((byte)0x22)] + public void ConstructorShouldRejectInvalidDelimiters(byte delimiter) + { + Assert.Throws("delimiter", () => new CsvTokenizer(delimiter)); + } [Theory] - [MemberData(nameof(TestCsvFilesWithChunkLengths))] - public void NullVisitorShouldBeFine(string fileName, int chunkLength) + [MemberData(nameof(TestCsvFiles))] + public void NullVisitorShouldBeFine(string filePath) { // arrange - string fullCsvFilePath = Path.Combine(TestCsvFilesFolderPath, fileName + ".csv"); - ReadOnlySpan fileData = File.ReadAllBytes(fullCsvFilePath); + ReadOnlySpan fileData = File.ReadAllBytes(filePath); var tokenizer = new CsvTokenizer(); // act - while (fileData.Length >= chunkLength) - { - tokenizer.ProcessNextChunk(fileData.Slice(0, 
chunkLength), null); - fileData = fileData.Slice(chunkLength); - } - tokenizer.ProcessNextChunk(fileData, null); tokenizer.ProcessEndOfStream(null); @@ -50,20 +55,27 @@ public void NullVisitorShouldBeFine(string fileName, int chunkLength) } [Theory] - [MemberData(nameof(TestCsvFilesWithChunkLengths))] - public void CsvTokenizationShouldMatchCsvHelper(string fileName, int chunkLength) + [MemberData(nameof(TestCsvFilesWithChunkLengthsAndDelimiters))] + public void CsvTokenizationShouldMatchCsvHelper(string fileName, int chunkLength, byte delimiter) { // arrange byte[] fileDataTemplate = File.ReadAllBytes(Path.Combine(TestCsvFilesFolderPath, fileName + ".csv")); + for (int i = 0; i < fileDataTemplate.Length; i++) + { + if (fileDataTemplate[i] == (byte)',') + { + fileDataTemplate[i] = delimiter; + } + } - int randomSeed = HashCode.Combine(fileName, chunkLength); + int randomSeed = HashCode.Combine(fileName, chunkLength, delimiter); foreach (byte[] fileData in VaryLineEndings(fileDataTemplate, randomSeed)) { // act - var actual = TokenizeCsvFileUsingMine(fileData, chunkLength); + var actual = TokenizeCsvFileUsingCursively(fileData, chunkLength, delimiter); // assert - var expected = TokenizeCsvFileUsingCsvHelper(fileData); + var expected = TokenizeCsvFileUsingCsvHelper(fileData, $"{(char)delimiter}"); Assert.Equal(expected, actual); } } @@ -80,13 +92,13 @@ public void MemoryMappedCsvShouldMatchCsvHelper(string filePath) var actual = visitor.Lines; // assert - var expected = TokenizeCsvFileUsingCsvHelper(File.ReadAllBytes(filePath)); + var expected = TokenizeCsvFileUsingCsvHelper(File.ReadAllBytes(filePath), ","); Assert.Equal(expected, actual); } - private static List TokenizeCsvFileUsingMine(ReadOnlySpan fileData, int chunkLength) + private static List TokenizeCsvFileUsingCursively(ReadOnlySpan fileData, int chunkLength, byte delimiter) { - var tokenizer = new CsvTokenizer(); + var tokenizer = new CsvTokenizer(delimiter); var visitor = new StringBufferingVisitor(fileData.Length); while (fileData.Length >= chunkLength) { @@ -99,11 +111,11 @@ private static List TokenizeCsvFileUsingMine(ReadOnlySpan fileDa return visitor.Lines; } - private static IEnumerable TokenizeCsvFileUsingCsvHelper(byte[] csvData) + private static IEnumerable TokenizeCsvFileUsingCsvHelper(byte[] csvData, string delimiter) { using (var stream = new MemoryStream(csvData, false)) using (var streamReader = new StreamReader(stream, new UTF8Encoding(false, false), false)) - using (var csvReader = new CsvReader(streamReader, new Configuration { BadDataFound = null })) + using (var csvReader = new CsvReader(streamReader, new Configuration { BadDataFound = null, Delimiter = delimiter })) { while (csvReader.Read()) { From b3be94f654beb2b03049bdb2bfd7e417453c4a72 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Wed, 5 Jun 2019 10:13:09 -0400 Subject: [PATCH 05/22] Try setting up LGTM --- Directory.Build.props | 2 +- lgtm.yml | 4 ++++ src/Directory.Build.props | 5 ++++- 3 files changed, 9 insertions(+), 2 deletions(-) create mode 100644 lgtm.yml diff --git a/Directory.Build.props b/Directory.Build.props index b6bbf76..491987b 100644 --- a/Directory.Build.props +++ b/Directory.Build.props @@ -8,10 +8,10 @@ true 7.3 - true + true true diff --git a/lgtm.yml b/lgtm.yml new file mode 100644 index 0000000..1052cbe --- /dev/null +++ b/lgtm.yml @@ -0,0 +1,4 @@ +extraction: + csharp: + after_prepare: + - export LGTM=true diff --git a/src/Directory.Build.props b/src/Directory.Build.props index a83c324..4b420f8 100644 --- 
a/src/Directory.Build.props +++ b/src/Directory.Build.props @@ -21,8 +21,11 @@ false - + + + + From c2e3f78fb32bbd6a815ada10ab15062dde5a8253 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Wed, 5 Jun 2019 10:22:25 -0400 Subject: [PATCH 06/22] One more try to get LGTM to work. --- Directory.Build.props | 2 +- lgtm.yml | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Directory.Build.props b/Directory.Build.props index 491987b..b10b8ad 100644 --- a/Directory.Build.props +++ b/Directory.Build.props @@ -15,7 +15,7 @@ true - + diff --git a/lgtm.yml b/lgtm.yml index 1052cbe..0c3c07c 100644 --- a/lgtm.yml +++ b/lgtm.yml @@ -2,3 +2,5 @@ extraction: csharp: after_prepare: - export LGTM=true + index: + solution: src/Cursively/Cursively.csproj From c973ecc27ed8985a5d9354d695e2b4eda58fdd2f Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Thu, 6 Jun 2019 08:59:42 -0400 Subject: [PATCH 07/22] Add worldcitiespop.csv for benchmarking. Apparently, this is a more-or-less standard CSV processing benchmark file. --- README.md | 10 +-- test/Cursively.Benchmark/Program.cs | 83 +++++++++++++++++-- .../large-csv-files/worldcitiespop.csv | 3 + 3 files changed, 84 insertions(+), 12 deletions(-) create mode 100644 test/Cursively.Benchmark/large-csv-files/worldcitiespop.csv diff --git a/README.md b/README.md index b1382b8..7b8411d 100644 --- a/README.md +++ b/README.md @@ -79,14 +79,12 @@ public sealed class MyVisitor : CsvReaderVisitorBase throw new InvalidDataException($"Field is longer than {_buffer.Length} characters."); } - if (!flush) + if (flush) { - return; + Console.Write("Field: "); + Console.WriteLine(_buffer, 0, _bufferConsumed); + _bufferConsumed = 0; } - - Console.Write("Field: "); - Console.WriteLine(_buffer, 0, _bufferConsumed); - _bufferConsumed = 0; } } ``` diff --git a/test/Cursively.Benchmark/Program.cs b/test/Cursively.Benchmark/Program.cs index 7279bf2..b45fcd6 100644 --- a/test/Cursively.Benchmark/Program.cs +++ b/test/Cursively.Benchmark/Program.cs @@ -11,9 +11,80 @@ namespace Cursively.Benchmark { - [ClrJob, CoreJob, GcServer(true), MemoryDiagnoser] + [ClrJob] + [CoreJob] + [GcServer(true)] + [MemoryDiagnoser] public class Program { + public static void ProcessCsvFile(string csvFilePath) + { + var myVisitor = new MyVisitor(maxFieldLength: 1000); + var tokenizer = new CsvTokenizer(); + using (var file = File.OpenRead(csvFilePath)) + { + Console.WriteLine($"Started reading '{csvFilePath}'."); + Span fileReadBuffer = new byte[4096]; + while (true) + { + int count = file.Read(fileReadBuffer); + if (count == 0) + { + break; + } + + var chunk = fileReadBuffer.Slice(0, count); + tokenizer.ProcessNextChunk(chunk, myVisitor); + } + + tokenizer.ProcessEndOfStream(myVisitor); + } + + Console.WriteLine($"Finished reading '{csvFilePath}'."); + } + + public sealed class MyVisitor : CsvReaderVisitorBase + { + private readonly Decoder _utf8Decoder = Encoding.UTF8.GetDecoder(); + + private readonly char[] _buffer; + + private int _bufferConsumed; + + public MyVisitor(int maxFieldLength) => + _buffer = new char[maxFieldLength]; + + public override void VisitPartialFieldContents(ReadOnlySpan chunk) => + VisitFieldContents(chunk, flush: false); + + public override void VisitEndOfField(ReadOnlySpan chunk) => + VisitFieldContents(chunk, flush: true); + + public override void VisitEndOfRecord() => + Console.WriteLine("End of fields for this record."); + + private void VisitFieldContents(ReadOnlySpan chunk, bool flush) + { + int charCount = _utf8Decoder.GetCharCount(chunk, flush); + if (charCount + 
_bufferConsumed <= _buffer.Length) + { + _utf8Decoder.GetChars(chunk, new Span(_buffer, _bufferConsumed, charCount), flush); + _bufferConsumed += charCount; + } + else + { + throw new InvalidDataException($"Field is longer than {_buffer.Length} characters."); + } + + if (flush) + { + Console.Write("Field: "); + Console.WriteLine(_buffer, 0, _bufferConsumed); + _bufferConsumed = 0; + } + } + } + public static CsvFile[] CsvFiles => GetCsvFiles(); [Benchmark(Baseline = true)] @@ -27,7 +98,7 @@ public long CountRowsUsingCursivelyByteArray(CsvFile csvFile) return visitor.RowCount; } - ////[Benchmark] + [Benchmark] [ArgumentsSource(nameof(CsvFiles))] public long CountRowsUsingCursivelyWithMemoryMappedFile(CsvFile csvFile) { @@ -36,7 +107,7 @@ public long CountRowsUsingCursivelyWithMemoryMappedFile(CsvFile csvFile) return visitor.RowCount; } - ////[Benchmark] + [Benchmark] [ArgumentsSource(nameof(CsvFiles))] public long CountRowsUsingCsvHelper(CsvFile csvFile) { @@ -59,7 +130,9 @@ private static int Main() var prog = new Program(); foreach (var csvFile in CsvFiles) { - if (prog.CountRowsUsingCursivelyByteArray(csvFile) != prog.CountRowsUsingCsvHelper(csvFile)) + long rowCount = prog.CountRowsUsingCursivelyByteArray(csvFile); + if (prog.CountRowsUsingCsvHelper(csvFile) != rowCount || + prog.CountRowsUsingCursivelyWithMemoryMappedFile(csvFile) != rowCount) { Console.Error.WriteLine($"Failed on {csvFile}."); return 1; @@ -90,8 +163,6 @@ private static CsvFile[] GetCsvFiles([CallerFilePath]string myLocation = null) = private sealed class RowCountingVisitor : CsvReaderVisitorBase { - public long CharCount { get; private set; } - public long RowCount { get; private set; } public override void VisitEndOfRecord() => ++RowCount; diff --git a/test/Cursively.Benchmark/large-csv-files/worldcitiespop.csv b/test/Cursively.Benchmark/large-csv-files/worldcitiespop.csv new file mode 100644 index 0000000..01dfbfe --- /dev/null +++ b/test/Cursively.Benchmark/large-csv-files/worldcitiespop.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4c7824338bbbc228f5d3e0089c57233136e83853821252ade7ed556b4bcfc1b +size 151492068 From 0d4a907f26b1d292513ceaf7b5399ef347bcb97a Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Thu, 6 Jun 2019 09:04:15 -0400 Subject: [PATCH 08/22] Revert the sample code accidentally added here. 
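The sample being reverted here (added in the previous patch) demonstrated the library's core consumption pattern: push arbitrary byte chunks through a `CsvTokenizer`, then signal end of stream. Since that sample now disappears from the tree, here is a condensed, self-contained sketch of the same pattern for reference. The visitor mirrors the benchmark's own `RowCountingVisitor`; the file path parameter and the 4096-byte buffer are illustrative choices, not part of this patch.

```csharp
using System;
using System.IO;
using Cursively;

// Mirrors the RowCountingVisitor used by the benchmarks: counts records,
// ignores field contents.
internal sealed class RowCountingVisitor : CsvReaderVisitorBase
{
    public long RowCount { get; private set; }

    public override void VisitEndOfRecord() => ++RowCount;

    public override void VisitEndOfField(ReadOnlySpan<byte> chunk) { }

    public override void VisitPartialFieldContents(ReadOnlySpan<byte> chunk) { }
}

internal static class ChunkedReadDemo
{
    public static long CountRows(string csvFilePath)
    {
        var visitor = new RowCountingVisitor();
        var tokenizer = new CsvTokenizer();
        using (var file = File.OpenRead(csvFilePath))
        {
            byte[] buffer = new byte[4096];
            int count;
            while ((count = file.Read(buffer, 0, buffer.Length)) != 0)
            {
                // Chunk boundaries may fall anywhere, even mid-field or
                // mid-quote; the tokenizer carries state between calls.
                tokenizer.ProcessNextChunk(new ReadOnlySpan<byte>(buffer, 0, count), visitor);
            }
        }

        // Flush whatever the final chunk left pending (e.g., a last record
        // with no trailing newline).
        tokenizer.ProcessEndOfStream(visitor);
        return visitor.RowCount;
    }
}
```

A later patch in this series adds `Csv.ProcessStream`, which wraps essentially this same read loop behind a single helper call.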
--- test/Cursively.Benchmark/Program.cs | 68 ----------------------------- 1 file changed, 68 deletions(-) diff --git a/test/Cursively.Benchmark/Program.cs b/test/Cursively.Benchmark/Program.cs index b45fcd6..ab36394 100644 --- a/test/Cursively.Benchmark/Program.cs +++ b/test/Cursively.Benchmark/Program.cs @@ -17,74 +17,6 @@ namespace Cursively.Benchmark [MemoryDiagnoser] public class Program { - public static void ProcessCsvFile(string csvFilePath) - { - var myVisitor = new MyVisitor(maxFieldLength: 1000); - var tokenizer = new CsvTokenizer(); - using (var file = File.OpenRead(csvFilePath)) - { - Console.WriteLine($"Started reading '{csvFilePath}'."); - Span fileReadBuffer = new byte[4096]; - while (true) - { - int count = file.Read(fileReadBuffer); - if (count == 0) - { - break; - } - - var chunk = fileReadBuffer.Slice(0, count); - tokenizer.ProcessNextChunk(chunk, myVisitor); - } - - tokenizer.ProcessEndOfStream(myVisitor); - } - - Console.WriteLine($"Finished reading '{csvFilePath}'."); - } - - public sealed class MyVisitor : CsvReaderVisitorBase - { - private readonly Decoder _utf8Decoder = Encoding.UTF8.GetDecoder(); - - private readonly char[] _buffer; - - private int _bufferConsumed; - - public MyVisitor(int maxFieldLength) => - _buffer = new char[maxFieldLength]; - - public override void VisitPartialFieldContents(ReadOnlySpan chunk) => - VisitFieldContents(chunk, flush: false); - - public override void VisitEndOfField(ReadOnlySpan chunk) => - VisitFieldContents(chunk, flush: true); - - public override void VisitEndOfRecord() => - Console.WriteLine("End of fields for this record."); - - private void VisitFieldContents(ReadOnlySpan chunk, bool flush) - { - int charCount = _utf8Decoder.GetCharCount(chunk, flush); - if (charCount + _bufferConsumed <= _buffer.Length) - { - _utf8Decoder.GetChars(chunk, new Span(_buffer, _bufferConsumed, charCount), flush); - _bufferConsumed += charCount; - } - else - { - throw new InvalidDataException($"Field is longer than {_buffer.Length} characters."); - } - - if (flush) - { - Console.Write("Field: "); - Console.WriteLine(_buffer, 0, _bufferConsumed); - _bufferConsumed = 0; - } - } - } - public static CsvFile[] CsvFiles => GetCsvFiles(); [Benchmark(Baseline = true)] From 2a310f1b5d1111afb96992cb9e7895c958648dee Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Thu, 13 Jun 2019 08:10:39 -0400 Subject: [PATCH 09/22] Add helpers for processing a Stream more easily. Resolves #10 --- src/Cursively/Csv.cs | 277 +++++++++++++++++++++- src/Cursively/Cursively.csproj | 1 + test/Cursively.Benchmark/Program.cs | 12 +- test/Cursively.Tests/CsvTokenizerTests.cs | 2 +- 4 files changed, 269 insertions(+), 23 deletions(-) diff --git a/src/Cursively/Csv.cs b/src/Cursively/Csv.cs index e23740b..917992a 100644 --- a/src/Cursively/Csv.cs +++ b/src/Cursively/Csv.cs @@ -1,6 +1,9 @@ using System; using System.IO; using System.IO.MemoryMappedFiles; +using System.Runtime.CompilerServices; +using System.Threading; +using System.Threading.Tasks; namespace Cursively { @@ -10,8 +13,224 @@ namespace Cursively public static class Csv { /// - /// Describes the contents of a CSV file to the given instance of the - /// class, using memory-mapped files behind the scenes. + /// Describes the contents of a CSV stream to the given instance of the + /// class. + /// + /// + /// The CSV stream to describe. + /// + /// + /// The instance to describe the stream to. + /// + /// + /// Thrown when is . 
+ /// + public static void ProcessStream(Stream csvStream, CsvReaderVisitorBase visitor) => + ProcessStream(csvStream, visitor, 81920); + + /// + /// Describes the contents of a CSV stream to the given instance of the + /// class. + /// + /// + /// The CSV stream to describe. + /// + /// + /// The instance to describe the stream to. + /// + /// + /// The length of the buffer to use (default: 81920). + /// + /// + /// Thrown when is . + /// + /// + /// Thrown when is not greater than zero. + /// + /// + /// Thrown when does not support reading (i.e., + /// is ). + /// + public static void ProcessStream(Stream csvStream, CsvReaderVisitorBase visitor, int bufferSize) + { + if (csvStream is null) + { + throw new ArgumentNullException(nameof(csvStream)); + } + + if (bufferSize <= 0) + { + throw new ArgumentOutOfRangeException(nameof(bufferSize), bufferSize, "Must be greater than zero."); + } + + if (!csvStream.CanRead) + { + throw new ArgumentException("Stream does not support reading.", nameof(csvStream)); + } + + byte[] buffer = new byte[bufferSize]; + var tokenizer = new CsvTokenizer(); + int cnt; + while ((cnt = csvStream.Read(buffer, 0, buffer.Length)) != 0) + { + tokenizer.ProcessNextChunk(new ReadOnlySpan(buffer, 0, cnt), visitor); + } + + tokenizer.ProcessEndOfStream(visitor); + } + + /// + /// Describes the contents of a CSV stream to the given instance of the + /// class. + /// + /// + /// The CSV stream to describe. + /// + /// + /// The instance to describe the stream to. + /// + /// + /// + /// An that will be notified every time the next chunk of the + /// stream is processed, with the size of the chunk (in bytes) that was processed. + /// + /// + /// All notifications will receive values less than or equal to the buffer size in bytes + /// (which, for this overload, is the default value of 81,920). + /// + /// + /// There will be one last notification with value 0 after the entire stream has been + /// processed and the final few stream elements have been consumed. + /// + /// + /// This may be left as if no progress notifications are needed. + /// + /// + /// + /// + /// An instance of that may be used to signal that results + /// are no longer needed, and so the method should terminate at its earliest convenience. + /// + /// + /// This may be left as its default value of if the + /// operation does not need to support cancellation. + /// + /// + /// + /// Thrown when is . + /// + /// + /// Thrown when does not support reading (i.e., + /// is ). + /// + /// + /// Thrown (perhaps asynchronously) to acknowledge cancellation. A derived exception, such + /// as , may also be thrown by the system. + /// + /// + /// Thrown (perhaps asynchronously) if the underlying + /// object backing is disposed before the asynchronous + /// operation terminates. + /// + public static ValueTask ProcessStreamAsync(Stream csvStream, CsvReaderVisitorBase visitor, IProgress progress = null, CancellationToken cancellationToken = default) => + ProcessStreamAsync(csvStream, visitor, 81920, progress, cancellationToken); + + /// + /// Describes the contents of a CSV stream to the given instance of the + /// class. + /// + /// + /// The CSV stream to describe. + /// + /// + /// The instance to describe the stream to. + /// + /// + /// The length of the buffer to use (default: 81920). + /// + /// + /// + /// An that will be notified every time the next chunk of the + /// stream is processed, with the size of the chunk (in bytes) that was processed. 
+ /// + /// + /// All notifications will receive values less than or equal to the buffer size in bytes + /// (which, for this overload, is the value of ). + /// + /// + /// There will be one last notification with value 0 after the entire stream has been + /// processed and the final few stream elements have been consumed. + /// + /// + /// This may be left as if no progress notifications are needed. + /// + /// + /// + /// + /// An instance of that may be used to signal that results + /// are no longer needed, and so the method should terminate at its earliest convenience. + /// + /// + /// This may be left as its default value of if the + /// operation does not need to support cancellation. + /// + /// + /// + /// Thrown when is . + /// + /// + /// Thrown when is not greater than zero. + /// + /// + /// Thrown when does not support reading (i.e., + /// is ). + /// + /// + /// Thrown (perhaps asynchronously) to acknowledge cancellation. A derived exception, such + /// as , may also be thrown by the system. + /// + /// + /// Thrown (perhaps asynchronously) if the underlying + /// object backing is disposed before the asynchronous + /// operation terminates. + /// + public static async ValueTask ProcessStreamAsync(Stream csvStream, CsvReaderVisitorBase visitor, int bufferSize, IProgress progress = null, CancellationToken cancellationToken = default) + { + if (csvStream is null) + { + throw new ArgumentNullException(nameof(csvStream)); + } + + if (bufferSize <= 0) + { + throw new ArgumentOutOfRangeException(nameof(bufferSize), bufferSize, "Must be greater than zero."); + } + + if (!csvStream.CanRead) + { + throw new ArgumentException("Stream does not support reading.", nameof(csvStream)); + } + + byte[] buffer = new byte[bufferSize]; + var tokenizer = new CsvTokenizer(); + int cnt; + while ((cnt = await csvStream.ReadAsync(buffer, 0, buffer.Length, cancellationToken).ConfigureAwait(false)) != 0) + { + tokenizer.ProcessNextChunk(new ReadOnlySpan(buffer, 0, cnt), visitor); + progress?.Report(cnt); + + // not all streams support cancellation, so we might as well do this ourselves. it + // does involve a volatile read, so don't go overboard. + cancellationToken.ThrowIfCancellationRequested(); + } + + tokenizer.ProcessEndOfStream(visitor); + progress?.Report(0); + } + + /// + /// Describes the entire contents of a CSV file to the given instance of the + /// class. /// /// /// The path to the CSV file to describe. @@ -19,9 +238,46 @@ public static class Csv /// /// The instance to describe the file to. /// - public static unsafe void ProcessMemoryMappedFile(string csvFilePath, CsvReaderVisitorBase visitor) + /// + /// The current version of this method uses memory-mapping behind the scenes in order to + /// minimize the overhead of copying and cutting across discrete buffers, at the expense of + /// slightly more overhead to set up the memory map than a typical read-from-stream pattern. + /// + /// + /// See . + /// + /// + /// See . + /// + /// + /// See . + /// + /// + /// See . + /// + /// + /// + /// See . + /// + /// + /// See . + /// + /// + /// + /// See . + /// + /// + /// See . + /// + /// + /// See . + /// + /// + /// See . 
+ /// + public static unsafe void ProcessEntireFile(string csvFilePath, CsvReaderVisitorBase visitor) { - using (var fl = new FileStream(csvFilePath, FileMode.Open, FileAccess.Read, FileShare.Read)) + using (var fl = new FileStream(csvFilePath, FileMode.Open, FileAccess.Read, FileShare.Read, 4096, FileOptions.SequentialScan)) { long length = fl.Length; if (length == 0) @@ -35,19 +291,18 @@ public static unsafe void ProcessMemoryMappedFile(string csvFilePath, CsvReaderV { var handle = accessor.SafeMemoryMappedViewHandle; byte* ptr = null; + RuntimeHelpers.PrepareConstrainedRegions(); try { handle.AcquirePointer(ref ptr); - for (long rem = length; rem > 0; rem -= int.MaxValue) + while (length > int.MaxValue) { - int currentChunkLength = rem < int.MaxValue - ? unchecked((int)rem) - : int.MaxValue; - - var span = new ReadOnlySpan(ptr, currentChunkLength); - tokenizer.ProcessNextChunk(span, visitor); + tokenizer.ProcessNextChunk(new ReadOnlySpan(ptr, int.MaxValue), visitor); + length -= int.MaxValue; + ptr += int.MaxValue; } + tokenizer.ProcessNextChunk(new ReadOnlySpan(ptr, unchecked((int)length)), visitor); tokenizer.ProcessEndOfStream(visitor); } finally diff --git a/src/Cursively/Cursively.csproj b/src/Cursively/Cursively.csproj index 6f5c7ae..06e06e7 100644 --- a/src/Cursively/Cursively.csproj +++ b/src/Cursively/Cursively.csproj @@ -17,6 +17,7 @@ + diff --git a/test/Cursively.Benchmark/Program.cs b/test/Cursively.Benchmark/Program.cs index ab36394..943359a 100644 --- a/test/Cursively.Benchmark/Program.cs +++ b/test/Cursively.Benchmark/Program.cs @@ -30,15 +30,6 @@ public long CountRowsUsingCursivelyByteArray(CsvFile csvFile) return visitor.RowCount; } - [Benchmark] - [ArgumentsSource(nameof(CsvFiles))] - public long CountRowsUsingCursivelyWithMemoryMappedFile(CsvFile csvFile) - { - var visitor = new RowCountingVisitor(); - Csv.ProcessMemoryMappedFile(csvFile.FullPath, visitor); - return visitor.RowCount; - } - [Benchmark] [ArgumentsSource(nameof(CsvFiles))] public long CountRowsUsingCsvHelper(CsvFile csvFile) @@ -63,8 +54,7 @@ private static int Main() foreach (var csvFile in CsvFiles) { long rowCount = prog.CountRowsUsingCursivelyByteArray(csvFile); - if (prog.CountRowsUsingCsvHelper(csvFile) != rowCount || - prog.CountRowsUsingCursivelyWithMemoryMappedFile(csvFile) != rowCount) + if (prog.CountRowsUsingCsvHelper(csvFile) != rowCount) { Console.Error.WriteLine($"Failed on {csvFile}."); return 1; diff --git a/test/Cursively.Tests/CsvTokenizerTests.cs b/test/Cursively.Tests/CsvTokenizerTests.cs index 615c624..76a5717 100644 --- a/test/Cursively.Tests/CsvTokenizerTests.cs +++ b/test/Cursively.Tests/CsvTokenizerTests.cs @@ -88,7 +88,7 @@ public void MemoryMappedCsvShouldMatchCsvHelper(string filePath) var visitor = new StringBufferingVisitor(checked((int)new FileInfo(filePath).Length)); // act - Csv.ProcessMemoryMappedFile(filePath, visitor); + Csv.ProcessEntireFile(filePath, visitor); var actual = visitor.Lines; // assert From 8d364ab82a80fc800f07aa17b65c08faf09d8a0b Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Thu, 13 Jun 2019 09:45:39 -0400 Subject: [PATCH 10/22] Always set flags before invoking the visitor --- src/Cursively/CsvTokenizer.cs | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/src/Cursively/CsvTokenizer.cs b/src/Cursively/CsvTokenizer.cs index 1a44af0..b8fdaf6 100644 --- a/src/Cursively/CsvTokenizer.cs +++ b/src/Cursively/CsvTokenizer.cs @@ -233,6 +233,10 @@ public void ProcessNextChunk(ReadOnlySpan chunk, 
CsvReaderVisitorBase visi continue; } + // loop one-by-one, instead of doing an IndexOfAny, greedily assuming that the most + // performance-sensitive applications will tend to have few enough bytes in each + // unquoted field that this manual inlining will benefit those applications **much** + // more than practically any IndexOfAny implementation would. for (int idx = 0; idx < chunk.Length; idx++) { byte c = chunk[idx]; @@ -247,14 +251,14 @@ public void ProcessNextChunk(ReadOnlySpan chunk, CsvReaderVisitorBase visi // RFC 4180 forbids quotes that show up anywhere but the beginning of a // field, so it's up to us to decide what we want to do about this. We // choose to treat all such quotes as just regular data. - visitor.VisitPartialFieldContents(chunk.Slice(0, idx + 1)); _parserFlags = ParserFlags.ReadAnythingInCurrentField | ParserFlags.ReadAnythingOnCurrentLine; + visitor.VisitPartialFieldContents(chunk.Slice(0, idx + 1)); } } else if (c == delimiter) { - visitor.VisitEndOfField(chunk.Slice(0, idx)); _parserFlags = ParserFlags.ReadAnythingOnCurrentLine; + visitor.VisitEndOfField(chunk.Slice(0, idx)); } else if (c == CR || c == LF) { @@ -269,8 +273,8 @@ public void ProcessNextChunk(ReadOnlySpan chunk, CsvReaderVisitorBase visi goto nextLoop; } - visitor.VisitPartialFieldContents(chunk); _parserFlags = ParserFlags.ReadAnythingInCurrentField | ParserFlags.ReadAnythingOnCurrentLine; + visitor.VisitPartialFieldContents(chunk); break; nextLoop:; @@ -329,8 +333,8 @@ private void PickUpFromLastTime(ref ReadOnlySpan readBuffer, CsvReaderVisi // in fact, it should pay off so well in so many cases that we can probably even // get away with making the other case really suboptimal, which is what it will // do when we pick up where we leave off after setting this flag. - visitor.VisitPartialFieldContents(readBuffer.Slice(0, idx)); _parserFlags |= ParserFlags.CutAtPotentiallyTerminalDoubleQuote; + visitor.VisitPartialFieldContents(readBuffer.Slice(0, idx)); readBuffer = default; return; } @@ -348,8 +352,8 @@ private void PickUpFromLastTime(ref ReadOnlySpan readBuffer, CsvReaderVisi // the double quote was the end of a quoted field, so send the entire data from // the beginning of this quoted field data chunk up to the double quote that // terminated it (excluding, of course, the double quote itself). - visitor.VisitEndOfField(readBuffer.Slice(0, idx)); _parserFlags = ParserFlags.ReadAnythingOnCurrentLine; + visitor.VisitEndOfField(readBuffer.Slice(0, idx)); } else if (b == CR || b == LF) { @@ -381,8 +385,8 @@ private void PickUpFromLastTime(ref ReadOnlySpan readBuffer, CsvReaderVisi byte b = readBuffer[idx]; if (b == _delimiter) { - visitor.VisitEndOfField(readBuffer.Slice(0, idx)); _parserFlags = ParserFlags.ReadAnythingOnCurrentLine; + visitor.VisitEndOfField(readBuffer.Slice(0, idx)); } else if (b == CR || b == LF) { @@ -433,16 +437,17 @@ private void HandleBufferCutAtPotentiallyTerminalDoubleQuote(ref ReadOnlySpan lastFieldDataChunk, CsvReaderVisitorBase visitor) { - if (!lastFieldDataChunk.IsEmpty || (_parserFlags & ParserFlags.ReadAnythingOnCurrentLine) != 0) + // even if the last field data chunk is empty, we still need to send it: we might be + // looking at a newline that immediately follows a comma, which is defined to mean + // an empty field at the end of a line. 
+ bool notify = !lastFieldDataChunk.IsEmpty || (_parserFlags & ParserFlags.ReadAnythingOnCurrentLine) != 0; + + _parserFlags = ParserFlags.None; + if (notify) { - // even if the last field data chunk is empty, we still need to send it: we might be - // looking at a newline that immediately follows a comma, which is defined to mean - // an empty field at the end of a line. visitor.VisitEndOfField(lastFieldDataChunk); visitor.VisitEndOfRecord(); } - - _parserFlags = ParserFlags.None; } } } From 35165fffcab8b8fa8fd1a21aa8bacaae67413781 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Thu, 13 Jun 2019 09:45:40 -0400 Subject: [PATCH 11/22] Give consumers a way to detect nonstandard data. Resolves #4 --- src/Cursively/CsvReaderVisitorBase.cs | 36 +++++++++++++++++++++++++-- src/Cursively/CsvTokenizer.cs | 22 ++++++++++++++-- 2 files changed, 54 insertions(+), 4 deletions(-) diff --git a/src/Cursively/CsvReaderVisitorBase.cs b/src/Cursively/CsvReaderVisitorBase.cs index 5b0eea6..0a748fc 100644 --- a/src/Cursively/CsvReaderVisitorBase.cs +++ b/src/Cursively/CsvReaderVisitorBase.cs @@ -29,8 +29,9 @@ public abstract class CsvReaderVisitorBase /// This method may be called at any time. /// /// - /// Only and may be - /// called directly after a call to this method. + /// Only , , and + /// may be called directly after a call to this + /// method. /// /// /// There are multiple reasons why this method may be called instead of going straight to @@ -94,6 +95,37 @@ public abstract class CsvReaderVisitorBase /// public abstract void VisitEndOfRecord(); + /// + /// + /// Notifies that the current field contains double-quote characters that do not comply with + /// RFC 4180, and so it is being processed according to this library's extra rules. + /// + /// + /// The default behavior of this method is to do nothing. Subclasses may wish to override + /// to add warnings / errors when processing streams that do not follow RFC 4180 and are + /// therefore in danger of being processed differently than other tools. + /// + /// + /// + /// + /// This method may only be called after a call to , + /// at most once per field (i.e., once it is called, it may not be called again until the + /// next call to ). + /// + /// + /// Only and may be + /// called directly after a call to this method. + /// + /// + /// Once called, the entire field described by all preceding consecutive calls to + /// calls, and all successive calls up to the next + /// , are considered to be "nonstandard". That means that this + /// method may be considered to affect the correctness of previous method calls, depending + /// on the semantics of the override. + /// + /// + public virtual void VisitNonstandardQuotedField() { } + private sealed class NullVisitor : CsvReaderVisitorBase { public override void VisitEndOfRecord() { } diff --git a/src/Cursively/CsvTokenizer.cs b/src/Cursively/CsvTokenizer.cs index b8fdaf6..c04ade4 100644 --- a/src/Cursively/CsvTokenizer.cs +++ b/src/Cursively/CsvTokenizer.cs @@ -253,6 +253,9 @@ public void ProcessNextChunk(ReadOnlySpan chunk, CsvReaderVisitorBase visi // choose to treat all such quotes as just regular data. _parserFlags = ParserFlags.ReadAnythingInCurrentField | ParserFlags.ReadAnythingOnCurrentLine; visitor.VisitPartialFieldContents(chunk.Slice(0, idx + 1)); + + // let the visitor know that this was nonstandard. 
+ visitor.VisitNonstandardQuotedField(); } } else if (c == delimiter) @@ -370,6 +373,9 @@ private void PickUpFromLastTime(ref ReadOnlySpan readBuffer, CsvReaderVisi _parserFlags |= ParserFlags.QuotedFieldDataEnded; visitor.VisitPartialFieldContents(readBuffer.Slice(0, idx)); visitor.VisitPartialFieldContents(readBuffer.Slice(idx + 1, 1)); + + // let the visitor know that this was nonstandard. + visitor.VisitNonstandardQuotedField(); } // slice off the data up to the quote and the next byte that we read. @@ -379,7 +385,9 @@ private void PickUpFromLastTime(ref ReadOnlySpan readBuffer, CsvReaderVisi { // this is expected to be rare: either we were cut between field reads, or we're // reading nonstandard field data where there's a quote that neither starts nor ends - // the field. + // the field; by this point, we don't save enough state to remember which case we're + // in, so VisitNonstandardQuotedField **MUST** have been correctly called (or not) + // before entering this section. for (int idx = 0; idx < readBuffer.Length; idx++) { byte b = readBuffer[idx]; @@ -416,7 +424,9 @@ private void HandleBufferCutAtPotentiallyTerminalDoubleQuote(ref ReadOnlySpan Date: Thu, 13 Jun 2019 09:50:01 -0400 Subject: [PATCH 12/22] fix and improve xmldoc --- src/Cursively/CsvReaderVisitorBase.cs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/Cursively/CsvReaderVisitorBase.cs b/src/Cursively/CsvReaderVisitorBase.cs index 0a748fc..a0118aa 100644 --- a/src/Cursively/CsvReaderVisitorBase.cs +++ b/src/Cursively/CsvReaderVisitorBase.cs @@ -71,7 +71,8 @@ public abstract class CsvReaderVisitorBase /// This method may be called at any time. /// /// - /// Any method, including this one, may be called directly after a call to this method. + /// Any method except , including this one, may be + /// called directly after a call to this method. /// /// /// This method may be called without a preceding @@ -109,8 +110,8 @@ public abstract class CsvReaderVisitorBase /// /// /// This method may only be called after a call to , - /// at most once per field (i.e., once it is called, it may not be called again until the - /// next call to ). + /// at most once per field (i.e., once it is called, it may not be called again until after + /// the next time that is called). /// /// /// Only and may be From 4f40606bf7d92fb137048ff83f004ead26d0c617 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Thu, 13 Jun 2019 09:53:37 -0400 Subject: [PATCH 13/22] improve xmldoc --- src/Cursively/CsvReaderVisitorBase.cs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/Cursively/CsvReaderVisitorBase.cs b/src/Cursively/CsvReaderVisitorBase.cs index a0118aa..09f25bb 100644 --- a/src/Cursively/CsvReaderVisitorBase.cs +++ b/src/Cursively/CsvReaderVisitorBase.cs @@ -109,9 +109,10 @@ public abstract class CsvReaderVisitorBase /// /// /// - /// This method may only be called after a call to , - /// at most once per field (i.e., once it is called, it may not be called again until after - /// the next time that is called). + /// This method may only be called as the very next method that gets called after a call to + /// , and only at most once per field (i.e., once it + /// is called, it may not be called again until a call brings + /// the tokenizer back to a state where RFC 4180 rules are expected). 
/// /// /// Only and may be From 39cf1d0c8a1ea26293ec49eb66f3bbb212bdece9 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Fri, 14 Jun 2019 08:52:50 -0400 Subject: [PATCH 14/22] Add nonstandard field test --- src/Cursively/CsvReaderVisitorBase.cs | 14 +++-- test/Cursively.Tests/CsvTokenizerTests.cs | 72 +++++++++++++++++++++-- 2 files changed, 77 insertions(+), 9 deletions(-) diff --git a/src/Cursively/CsvReaderVisitorBase.cs b/src/Cursively/CsvReaderVisitorBase.cs index 09f25bb..d2a5410 100644 --- a/src/Cursively/CsvReaderVisitorBase.cs +++ b/src/Cursively/CsvReaderVisitorBase.cs @@ -119,11 +119,15 @@ public abstract class CsvReaderVisitorBase /// called directly after a call to this method. /// /// - /// Once called, the entire field described by all preceding consecutive calls to - /// calls, and all successive calls up to the next - /// , are considered to be "nonstandard". That means that this - /// method may be considered to affect the correctness of previous method calls, depending - /// on the semantics of the override. + /// The last byte in the preceding call's chunk will + /// be the specific byte that was unexpected; all bytes before it were legal under RFC 4180. + /// So if this event is being raised because the tokenizer found a double-quote in a field + /// that did not start with a double-quote, then was + /// previously called with a chunk that ended with that double-quote. If it's being raised + /// because a double-quote was found in a quoted field that was not immediately followed by + /// a double-quote, delimiter, or line ending, then + /// was previously called with a chunk that ended with whichever byte immediately followed + /// the double-quote that ended the quoted part of the quoted field data. /// /// public virtual void VisitNonstandardQuotedField() { } diff --git a/test/Cursively.Tests/CsvTokenizerTests.cs b/test/Cursively.Tests/CsvTokenizerTests.cs index 76a5717..add2793 100644 --- a/test/Cursively.Tests/CsvTokenizerTests.cs +++ b/test/Cursively.Tests/CsvTokenizerTests.cs @@ -89,13 +89,36 @@ public void MemoryMappedCsvShouldMatchCsvHelper(string filePath) // act Csv.ProcessEntireFile(filePath, visitor); - var actual = visitor.Lines; + var actual = visitor.Records; // assert var expected = TokenizeCsvFileUsingCsvHelper(File.ReadAllBytes(filePath), ","); Assert.Equal(expected, actual); } + [Fact] + public void NonstandardQuotedFieldsShouldNotify() + { + // arrange + string csvFilePath = Path.Combine(TestCsvFilesFolderPath, "nonstandard.csv"); + var visitor = new NonstandardFieldVisitor(checked((int)new FileInfo(csvFilePath).Length)); + + // act + Csv.ProcessEntireFile(csvFilePath, visitor); + + // assert + string[] expectedContentsBeforeNonstandardFields = + { + "hello ", + "hello ", + "good\"", + @"100% coverage, with the version of Roslyn shipped with the .NET Core 3.0 Preview 4 SDK version, is impossible... +...unless I do something like making the byte immediately after this quoted field something with an ASCII value less than 13 that's not 10. +Tab ('\t') has an ASCII value of 9, which is perfect for this. 
so here's your tab: ", + }; + Assert.Equal(expectedContentsBeforeNonstandardFields, visitor.ContentsBeforeNonstandardFields); + } + private static List TokenizeCsvFileUsingCursively(ReadOnlySpan fileData, int chunkLength, byte delimiter) { var tokenizer = new CsvTokenizer(delimiter); @@ -108,7 +131,7 @@ private static List TokenizeCsvFileUsingCursively(ReadOnlySpan f tokenizer.ProcessNextChunk(fileData, visitor); tokenizer.ProcessEndOfStream(visitor); - return visitor.Lines; + return visitor.Records; } private static IEnumerable TokenizeCsvFileUsingCsvHelper(byte[] csvData, string delimiter) @@ -186,11 +209,11 @@ private sealed class StringBufferingVisitor : CsvReaderVisitorBase public StringBufferingVisitor(int fileLength) => _cutBuffer = new byte[fileLength]; - public List Lines { get; } = new List(); + public List Records { get; } = new List(); public override void VisitEndOfRecord() { - Lines.Add(_fields.ToArray()); + Records.Add(_fields.ToArray()); _fields.Clear(); } @@ -214,5 +237,46 @@ private void CopyToCutBuffer(ReadOnlySpan chunk) _cutBufferConsumed += chunk.Length; } } + + private sealed class NonstandardFieldVisitor : CsvReaderVisitorBase + { + private readonly Decoder _decoder = new UTF8Encoding(false, true).GetDecoder(); + + private readonly char[] _fieldBuffer; + + private int _fieldBufferConsumed; + + public NonstandardFieldVisitor(int byteCount) => + _fieldBuffer = new char[Encoding.UTF8.GetMaxCharCount(byteCount)]; + + public override void VisitEndOfField(ReadOnlySpan chunk) + { + VisitFieldContents(chunk, true); + _fieldBufferConsumed = 0; + } + + public List ContentsBeforeNonstandardFields { get; } = new List(); + + public override void VisitEndOfRecord() { } + + public override void VisitPartialFieldContents(ReadOnlySpan chunk) => + VisitFieldContents(chunk, false); + + public override void VisitNonstandardQuotedField() + { + VisitFieldContents(default, true); + ContentsBeforeNonstandardFields.Add(new string(_fieldBuffer, 0, _fieldBufferConsumed)); + } + + private void VisitFieldContents(ReadOnlySpan chunk, bool flush) + { + int cnt = _decoder.GetCharCount(chunk, flush); + if (cnt > 0) + { + _decoder.GetChars(chunk, new Span(_fieldBuffer, _fieldBufferConsumed, cnt), flush); + _fieldBufferConsumed += cnt; + } + } + } } } From 4a13e857ff75e0d0a966e8e20b723231c077619b Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Fri, 14 Jun 2019 09:55:49 -0400 Subject: [PATCH 15/22] Archive the benchmark files. They'll be decompressed at the start of the benchmark runs. 
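The extraction goes through a temporary directory and is only moved into place once it has fully completed, presumably so that an interrupted run can't leave behind a half-extracted folder that later runs would mistake for real input. A minimal sketch of that pattern (paths illustrative; the actual code lives in `GetCsvFiles` below):

```csharp
// using System.IO; using System.IO.Compression;
// Sketch: extract next to the zip, then move into place in one step.
string finalDir = "large-csv-files";
string tmpDir = finalDir + "-tmp";
if (!Directory.Exists(finalDir))
{
    if (Directory.Exists(tmpDir))
    {
        Directory.Delete(tmpDir, true); // leftover from an interrupted run
    }

    ZipFile.ExtractToDirectory(finalDir + ".zip", tmpDir);
    Directory.Move(tmpDir, finalDir); // finalDir appears only when complete
}
```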
--- .gitattributes | 2 +- test/Cursively.Benchmark/.gitignore | 1 + test/Cursively.Benchmark/Program.cs | 24 ++++++++++++++++--- test/Cursively.Benchmark/large-csv-files.zip | 3 +++ .../100-huge-records-quoted.csv | 3 --- .../large-csv-files/100-huge-records.csv | 3 --- .../large-csv-files/10k-empty-records.csv | 3 --- .../large-csv-files/mocked.csv | 3 --- .../large-csv-files/worldcitiespop.csv | 3 --- 9 files changed, 26 insertions(+), 19 deletions(-) create mode 100644 test/Cursively.Benchmark/.gitignore create mode 100644 test/Cursively.Benchmark/large-csv-files.zip delete mode 100644 test/Cursively.Benchmark/large-csv-files/100-huge-records-quoted.csv delete mode 100644 test/Cursively.Benchmark/large-csv-files/100-huge-records.csv delete mode 100644 test/Cursively.Benchmark/large-csv-files/10k-empty-records.csv delete mode 100644 test/Cursively.Benchmark/large-csv-files/mocked.csv delete mode 100644 test/Cursively.Benchmark/large-csv-files/worldcitiespop.csv diff --git a/.gitattributes b/.gitattributes index a2ad6cb..cd8c663 100644 --- a/.gitattributes +++ b/.gitattributes @@ -57,4 +57,4 @@ #*.PDF diff=astextplain #*.rtf diff=astextplain #*.RTF diff=astextplain -**/large-csv-files/** filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text diff --git a/test/Cursively.Benchmark/.gitignore b/test/Cursively.Benchmark/.gitignore new file mode 100644 index 0000000..ff63206 --- /dev/null +++ b/test/Cursively.Benchmark/.gitignore @@ -0,0 +1 @@ +large-csv-files/*.csv diff --git a/test/Cursively.Benchmark/Program.cs b/test/Cursively.Benchmark/Program.cs index 943359a..3da0da3 100644 --- a/test/Cursively.Benchmark/Program.cs +++ b/test/Cursively.Benchmark/Program.cs @@ -1,5 +1,6 @@ using System; using System.IO; +using System.IO.Compression; using System.Runtime.CompilerServices; using System.Text; @@ -79,9 +80,26 @@ public CsvFile(string fullPath) => public override string ToString() => FileName; } - private static CsvFile[] GetCsvFiles([CallerFilePath]string myLocation = null) => - Array.ConvertAll(Directory.GetFiles(Path.Combine(Path.GetDirectoryName(myLocation), "large-csv-files"), "*.csv"), - fullPath => new CsvFile(fullPath)); + private static CsvFile[] GetCsvFiles([CallerFilePath]string myLocation = null) + { + string csvFileDirectoryPath = Path.Combine(Path.GetDirectoryName(myLocation), "large-csv-files"); + if (!Directory.Exists(csvFileDirectoryPath)) + { + string tmpDirectoryPath = csvFileDirectoryPath + "-tmp"; + if (Directory.Exists(tmpDirectoryPath)) + { + Directory.Delete(tmpDirectoryPath, true); + } + + string zipFilePath = csvFileDirectoryPath + ".zip"; + Directory.CreateDirectory(tmpDirectoryPath); + ZipFile.ExtractToDirectory(zipFilePath, tmpDirectoryPath); + Directory.Move(tmpDirectoryPath, csvFileDirectoryPath); + } + + return Array.ConvertAll(Directory.GetFiles(csvFileDirectoryPath, "*.csv"), + fullPath => new CsvFile(fullPath)); + } private sealed class RowCountingVisitor : CsvReaderVisitorBase { diff --git a/test/Cursively.Benchmark/large-csv-files.zip b/test/Cursively.Benchmark/large-csv-files.zip new file mode 100644 index 0000000..2296075 --- /dev/null +++ b/test/Cursively.Benchmark/large-csv-files.zip @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc70c8d20921546b1fa4e587859a22a2edfce76325d8e5bc780b98a78409a76d +size 47747942 diff --git a/test/Cursively.Benchmark/large-csv-files/100-huge-records-quoted.csv b/test/Cursively.Benchmark/large-csv-files/100-huge-records-quoted.csv deleted file mode 100644 index 
718947c..0000000 --- a/test/Cursively.Benchmark/large-csv-files/100-huge-records-quoted.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:611a7ba4f69bf3ab34f1fbf3fbf4711bfa8fb91a210683bdf4c1915818f1cfe0 -size 4900444 diff --git a/test/Cursively.Benchmark/large-csv-files/100-huge-records.csv b/test/Cursively.Benchmark/large-csv-files/100-huge-records.csv deleted file mode 100644 index fde3ed5..0000000 --- a/test/Cursively.Benchmark/large-csv-files/100-huge-records.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3e82c977d84c24a6b16063b634cbeab1e8409b34724b0ecf07893f45f8aadb53 -size 2900444 diff --git a/test/Cursively.Benchmark/large-csv-files/10k-empty-records.csv b/test/Cursively.Benchmark/large-csv-files/10k-empty-records.csv deleted file mode 100644 index 61dd063..0000000 --- a/test/Cursively.Benchmark/large-csv-files/10k-empty-records.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6f1e211bf4eb14ab578ccf6aff141e8db41e80314b39b85fba5f047830f746e4 -size 10020000 diff --git a/test/Cursively.Benchmark/large-csv-files/mocked.csv b/test/Cursively.Benchmark/large-csv-files/mocked.csv deleted file mode 100644 index 4b45c74..0000000 --- a/test/Cursively.Benchmark/large-csv-files/mocked.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e01c74f0a9622e4ad72233ff35bfcc2663eca10b558d0d7e7f71932c6c981d4b -size 12731500 diff --git a/test/Cursively.Benchmark/large-csv-files/worldcitiespop.csv b/test/Cursively.Benchmark/large-csv-files/worldcitiespop.csv deleted file mode 100644 index 01dfbfe..0000000 --- a/test/Cursively.Benchmark/large-csv-files/worldcitiespop.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c4c7824338bbbc228f5d3e0089c57233136e83853821252ade7ed556b4bcfc1b -size 151492068 From 27e153198b9d3c312f4a11a65dada212b11042b2 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Sat, 15 Jun 2019 07:52:43 -0400 Subject: [PATCH 16/22] Rename this back down. --- test/Cursively.Benchmark/Program.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/Cursively.Benchmark/Program.cs b/test/Cursively.Benchmark/Program.cs index 3da0da3..c946b5a 100644 --- a/test/Cursively.Benchmark/Program.cs +++ b/test/Cursively.Benchmark/Program.cs @@ -22,7 +22,7 @@ public class Program [Benchmark(Baseline = true)] [ArgumentsSource(nameof(CsvFiles))] - public long CountRowsUsingCursivelyByteArray(CsvFile csvFile) + public long CountRowsUsingCursively(CsvFile csvFile) { var visitor = new RowCountingVisitor(); var tokenizer = new CsvTokenizer(); @@ -54,7 +54,7 @@ private static int Main() var prog = new Program(); foreach (var csvFile in CsvFiles) { - long rowCount = prog.CountRowsUsingCursivelyByteArray(csvFile); + long rowCount = prog.CountRowsUsingCursively(csvFile); if (prog.CountRowsUsingCsvHelper(csvFile) != rowCount) { Console.Error.WriteLine($"Failed on {csvFile}."); From 918e3060233a9daa1e6338ddc4fc5bf1be31f564 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Sat, 15 Jun 2019 10:37:27 -0400 Subject: [PATCH 17/22] Reorder these checks. 
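Delimiters and line endings are far more common than quotes in typical CSV data, so testing for them first should save comparisons per byte on the hot path; that appears to be the motivation here. After this patch, the main scan loop is shaped roughly like this (a simplified sketch; the full logic is in the diff below):

```csharp
// Simplified: check the common bytes before the rare QUOTE path.
for (int idx = 0; idx < chunk.Length; idx++)
{
    byte c = chunk[idx];
    if (c == delimiter)
    {
        // common: a field ended on the current line
    }
    else if (c == CR || c == LF)
    {
        // common: the record ended
    }
    else if (c == QUOTE)
    {
        // rare: quoted-field handling
    }
}
```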
--- src/Cursively/CsvTokenizer.cs | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/Cursively/CsvTokenizer.cs b/src/Cursively/CsvTokenizer.cs index c04ade4..61955cc 100644 --- a/src/Cursively/CsvTokenizer.cs +++ b/src/Cursively/CsvTokenizer.cs @@ -240,7 +240,16 @@ public void ProcessNextChunk(ReadOnlySpan chunk, CsvReaderVisitorBase visi for (int idx = 0; idx < chunk.Length; idx++) { byte c = chunk[idx]; - if (c == QUOTE) + if (c == delimiter) + { + _parserFlags = ParserFlags.ReadAnythingOnCurrentLine; + visitor.VisitEndOfField(chunk.Slice(0, idx)); + } + else if (c == CR || c == LF) + { + ProcessEndOfLine(chunk.Slice(0, idx), visitor); + } + else if (c == QUOTE) { if (idx == 0) { @@ -258,15 +267,6 @@ public void ProcessNextChunk(ReadOnlySpan chunk, CsvReaderVisitorBase visi visitor.VisitNonstandardQuotedField(); } } - else if (c == delimiter) - { - _parserFlags = ParserFlags.ReadAnythingOnCurrentLine; - visitor.VisitEndOfField(chunk.Slice(0, idx)); - } - else if (c == CR || c == LF) - { - ProcessEndOfLine(chunk.Slice(0, idx), visitor); - } else { continue; @@ -311,12 +311,6 @@ public void ProcessEndOfStream(CsvReaderVisitorBase visitor) private void PickUpFromLastTime(ref ReadOnlySpan readBuffer, CsvReaderVisitorBase visitor) { - if ((_parserFlags & ParserFlags.CutAtPotentiallyTerminalDoubleQuote) != 0) - { - HandleBufferCutAtPotentiallyTerminalDoubleQuote(ref readBuffer, visitor); - return; - } - if ((_parserFlags & (ParserFlags.CurrentFieldStartedWithQuote | ParserFlags.QuotedFieldDataEnded)) == ParserFlags.CurrentFieldStartedWithQuote) { int idx = readBuffer.IndexOf(QUOTE); @@ -388,6 +382,12 @@ private void PickUpFromLastTime(ref ReadOnlySpan readBuffer, CsvReaderVisi // the field; by this point, we don't save enough state to remember which case we're // in, so VisitNonstandardQuotedField **MUST** have been correctly called (or not) // before entering this section. + if ((_parserFlags & ParserFlags.CutAtPotentiallyTerminalDoubleQuote) != 0) + { + HandleBufferCutAtPotentiallyTerminalDoubleQuote(ref readBuffer, visitor); + return; + } + for (int idx = 0; idx < readBuffer.Length; idx++) { byte b = readBuffer[idx]; From 2f1f7cc4c19a9286bf2dfa0b3db8af2adc40e1aa Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Sat, 15 Jun 2019 10:42:21 -0400 Subject: [PATCH 18/22] Fix an oversight in the previous commit. Fields cut at potentially terminal double-quotes are, of course, quoted. --- src/Cursively/CsvTokenizer.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Cursively/CsvTokenizer.cs b/src/Cursively/CsvTokenizer.cs index 61955cc..ce80a4d 100644 --- a/src/Cursively/CsvTokenizer.cs +++ b/src/Cursively/CsvTokenizer.cs @@ -311,7 +311,7 @@ public void ProcessEndOfStream(CsvReaderVisitorBase visitor) private void PickUpFromLastTime(ref ReadOnlySpan readBuffer, CsvReaderVisitorBase visitor) { - if ((_parserFlags & (ParserFlags.CurrentFieldStartedWithQuote | ParserFlags.QuotedFieldDataEnded)) == ParserFlags.CurrentFieldStartedWithQuote) + if ((_parserFlags & (ParserFlags.CurrentFieldStartedWithQuote | ParserFlags.QuotedFieldDataEnded | ParserFlags.CutAtPotentiallyTerminalDoubleQuote)) == ParserFlags.CurrentFieldStartedWithQuote) { int idx = readBuffer.IndexOf(QUOTE); if (idx < 0) From 3c73bd040859492dd3d46c97d12fed1eeb103d61 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Sat, 15 Jun 2019 14:32:56 -0400 Subject: [PATCH 19/22] Add first-class support for headered CSV streams. 
This support comes in the form of an intermediate visitor base class that has multiple built-in protections designed to help thwart DDoS attacks. Resolves #5 --- .../CsvReaderVisitorWithUTF8HeadersBase.cs | 471 ++++++++++++++++++ src/Cursively/CsvTokenizer.cs | 10 +- src/Cursively/Cursively.csproj | 1 + src/Cursively/CursivelyDataStreamException.cs | 31 ++ .../CursivelyDecoderExceptionFallback.cs | 42 ++ .../CursivelyExtraDataFieldsException.cs | 25 + .../CursivelyHeaderIsTooLongException.cs | 25 + .../CursivelyHeadersAreNotUTF8Exception.cs | 32 ++ .../CursivelyMissingDataFieldsException.cs | 25 + .../CursivelyTooManyHeadersException.cs | 25 + test/Cursively.Tests/CsvTokenizerTests.cs | 140 +++++- test/Cursively.Tests/Cursively.Tests.csproj | 2 +- .../invalid/invalid-utf8-in-header.csv | 2 + .../invalid/missing-data-fields.csv | 3 + .../invalid/too-many-data-fields.csv | 2 + .../valid/invalid-utf8-outside-header.csv | 2 + .../with-headers/valid/simple.csv | 3 + 17 files changed, 825 insertions(+), 16 deletions(-) create mode 100644 src/Cursively/CsvReaderVisitorWithUTF8HeadersBase.cs create mode 100644 src/Cursively/CursivelyDataStreamException.cs create mode 100644 src/Cursively/CursivelyDecoderExceptionFallback.cs create mode 100644 src/Cursively/CursivelyExtraDataFieldsException.cs create mode 100644 src/Cursively/CursivelyHeaderIsTooLongException.cs create mode 100644 src/Cursively/CursivelyHeadersAreNotUTF8Exception.cs create mode 100644 src/Cursively/CursivelyMissingDataFieldsException.cs create mode 100644 src/Cursively/CursivelyTooManyHeadersException.cs create mode 100644 test/Cursively.Tests/TestCsvFiles/with-headers/invalid/invalid-utf8-in-header.csv create mode 100644 test/Cursively.Tests/TestCsvFiles/with-headers/invalid/missing-data-fields.csv create mode 100644 test/Cursively.Tests/TestCsvFiles/with-headers/invalid/too-many-data-fields.csv create mode 100644 test/Cursively.Tests/TestCsvFiles/with-headers/valid/invalid-utf8-outside-header.csv create mode 100644 test/Cursively.Tests/TestCsvFiles/with-headers/valid/simple.csv diff --git a/src/Cursively/CsvReaderVisitorWithUTF8HeadersBase.cs b/src/Cursively/CsvReaderVisitorWithUTF8HeadersBase.cs new file mode 100644 index 0000000..dae9ca4 --- /dev/null +++ b/src/Cursively/CsvReaderVisitorWithUTF8HeadersBase.cs @@ -0,0 +1,471 @@ +using System; +using System.Collections.Immutable; +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Text; + +namespace Cursively +{ + /// + /// + /// Intermediate base class for CSV reader visitors that don't want to have to implement header + /// handling by themselves. + /// + /// + /// Instances of this class are tied to a single CSV stream and cannot be reused or reset for + /// use with other CSV streams. + /// + /// + /// Each instance of this visitor has an upper-bound on the maximum number of headers and on the + /// maximum length of each header. CSV streams that exceed these limits will cause this class + /// to throw exceptions, and behavior of a particular instance is undefined once this happens. + /// + /// + /// + /// + /// The following input-dependent exceptions may get thrown when using this visitor, all of + /// which inherit from : + /// + /// + /// + /// + /// if is + /// being used and the CSV stream contains a sequence of invalid UTF-8 bytes. + /// + /// + /// + /// + /// if the CSV stream contains one or more + /// headers that are longer than the configured maximum. 
+    ///
+    ///
+    ///
+    /// if the CSV stream contains more headers than
+    /// the configured maximum.
+    ///
+    ///
+    ///
+    ///
+    /// , by default, if a data record contains fewer
+    /// fields than the header record.
+    ///
+    ///
+    ///
+    ///
+    /// , by default, if a data record contains more
+    /// fields than the header record.
+    ///
+    ///
+    ///
+    ///
+    public abstract class CsvReaderVisitorWithUTF8HeadersBase : CsvReaderVisitorBase
+    {
+        ///
+        /// The value used by to initialize the
+        /// maximum number of headers (1,000).
+        ///
+        protected static readonly int DefaultMaxHeaderCount = 1_000;
+
+        ///
+        /// The value used by to initialize the
+        /// maximum length, in UTF-16 code units, of a single header (100).
+        ///
+        protected static readonly int DefaultMaxHeaderLength = 100;
+
+        ///
+        /// The value used by to initialize the
+        /// value indicating whether or not to ignore a leading UTF-8 BOM (true).
+        ///
+        protected static readonly bool DefaultIgnoreUTF8IdentifierOnFirstHeaderField = true;
+
+        ///
+        /// The value used by to initialize the
+        /// fallback logic when the decoder encounters invalid UTF-8 bytes (throw an exception).
+        ///
+        protected static readonly DecoderFallback DefaultDecoderFallback = new CursivelyDecoderExceptionFallback();
+
+        private static readonly UTF8Encoding EncodingToUse = new UTF8Encoding(false, false);
+
+        private readonly Decoder _headerDecoder;
+
+        private readonly ImmutableArray.Builder _headersBuilder;
+
+        private readonly bool _ignoreUTF8IdentifierOnFirstHeaderField;
+
+        private char[] _headerBuffer;
+
+        private ImmutableArray _headers;
+
+        private int _headerBufferConsumed;
+
+        private int _currentFieldIndex = -1;
+
+        ///
+        /// Initializes a new instance of the class.
+        ///
+        protected CsvReaderVisitorWithUTF8HeadersBase()
+            : this(maxHeaderCount: DefaultMaxHeaderCount,
+                   maxHeaderLength: DefaultMaxHeaderLength,
+                   ignoreUTF8IdentifierOnFirstHeaderField: DefaultIgnoreUTF8IdentifierOnFirstHeaderField,
+                   decoderFallback: DefaultDecoderFallback)
+        {
+        }
+
+        ///
+        /// Initializes a new instance of the class.
+        ///
+        ///
+        /// The maximum number of headers to allow.
+        /// Default: .
+        ///
+        ///
+        /// The maximum length, in UTF-16 code units, of any particular header.
+        /// Default: .
+        ///
+        ///
+        /// A value indicating whether or not to ignore a leading UTF-8 BOM.
+        /// Default: .
+        ///
+        ///
+        /// The fallback logic used when the decoder encounters invalid UTF-8 bytes.
+        /// Default: .
+        ///
+        ///
+        /// Thrown when is .
+        ///
+        ///
+        /// Thrown when or is
+        /// less than 1.
+        ///
+        protected CsvReaderVisitorWithUTF8HeadersBase(int maxHeaderCount, int maxHeaderLength, bool ignoreUTF8IdentifierOnFirstHeaderField, DecoderFallback decoderFallback)
+        {
+            if (maxHeaderCount < 1)
+            {
+                throw new ArgumentOutOfRangeException(nameof(maxHeaderCount), maxHeaderCount, "Must be greater than zero.");
+            }
+
+            if (maxHeaderLength < 1)
+            {
+                throw new ArgumentOutOfRangeException(nameof(maxHeaderLength), maxHeaderLength, "Must be greater than zero.");
+            }
+
+            if (decoderFallback is null)
+            {
+                throw new ArgumentNullException(nameof(decoderFallback));
+            }
+
+            _ignoreUTF8IdentifierOnFirstHeaderField = ignoreUTF8IdentifierOnFirstHeaderField;
+
+            _headersBuilder = ImmutableArray.CreateBuilder(maxHeaderCount);
+
+            _headerBuffer = new char[maxHeaderLength];
+
+            _headerDecoder = EncodingToUse.GetDecoder();
+            _headerDecoder.Fallback = decoderFallback;
+        }
+
+        ///
+        ///
+        /// Gets the headers of the CSV stream.
+        ///
+        ///
+        /// Only valid after has been called.
+ /// + /// + /// + /// Thrown when trying to access this value before has + /// been called. + /// + /// + /// Once initialized, the value will remain the same for as long as this object instance + /// stays alive. + /// + protected ImmutableArray Headers + { + get + { + if (_headers.IsDefault) + { + ThrowExceptionWhenHeadersAreStillBeingBuilt(); + } + + return _headers; + } + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static void ThrowExceptionWhenHeadersAreStillBeingBuilt() => + throw new InvalidOperationException("Headers are still being built."); + + /// + /// Gets the zero-based index of the field that is currently being read. The value should + /// be the length of during and + /// , except after or + /// has been called. + /// + protected int CurrentFieldIndex => _currentFieldIndex; + + /// + public sealed override void VisitPartialFieldContents(ReadOnlySpan chunk) + { + if (_headers.IsDefault || _currentFieldIndex >= _headers.Length) + { + VisitPartialFieldContentsSlow(chunk); + } + else + { + VisitPartialDataFieldContents(chunk); + } + } + + /// + public sealed override void VisitEndOfField(ReadOnlySpan chunk) + { + if (_headers.IsDefault || _currentFieldIndex >= _headers.Length) + { + VisitEndOfFieldSlow(chunk); + } + else + { + VisitEndOfDataField(chunk); + ++_currentFieldIndex; + } + } + + /// + public sealed override void VisitEndOfRecord() + { + if (_headers.IsDefault || _currentFieldIndex != _headers.Length) + { + VisitEndOfRecordSlow(); + } + else + { + VisitEndOfDataRecord(); + _currentFieldIndex = 0; + } + } + + /// + /// + /// Notifies that all headers have been read and is safe to read. + /// + /// + /// The default behavior is to do nothing. + /// + /// + protected virtual void VisitEndOfHeaderRecord() { } + + /// + /// Visits part of a non-header field's data. + /// + /// + /// The data from this part of the field. + /// + /// + /// See documentation for for + /// details about when and how this method will be called. + /// + protected abstract void VisitPartialDataFieldContents(ReadOnlySpan chunk); + + /// + /// Visits the last part of a non-header field's data. + /// + /// + /// The data from the last part of the field. + /// + /// + /// See documentation for for + /// details about when and how this method will be called. + /// + protected abstract void VisitEndOfDataField(ReadOnlySpan chunk); + + /// + /// Notifies that all fields in the current non-header record have been visited. + /// + /// + /// See documentation for for + /// details about when and how this method will be called. + /// + protected abstract void VisitEndOfDataRecord(); + + /// + /// + /// Notifies that the current non-header record is about to be terminated without reading + /// all the fields that were identified in the header record. + /// + /// + /// The default behavior is to throw . + /// + /// + protected virtual void VisitMissingDataFields() + { + if (_headers.IsDefault) + { + // we will never do this, but a cheeky subclass might. + throw new InvalidOperationException("This method is only intended to be called by the base class."); + } + + throw new CursivelyMissingDataFieldsException(_headers.Length, _currentFieldIndex); + } + + /// + /// + /// Notifies that data for a field is about to be read on a non-header record, but all the + /// fields that were identified in the header record have already been read. + /// + /// + /// This method is called before every single or + /// call for fields not present in the header record. 
+ /// + /// + /// The default behavior is to throw . + /// + /// + protected virtual void VisitUnexpectedDataField() + { + if (_headers.IsDefault) + { + // we will never do this, but a cheeky subclass might. + throw new InvalidOperationException("This method is only intended to be called by the base class."); + } + + throw new CursivelyExtraDataFieldsException(_headers.Length); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private unsafe void VisitPartialFieldContentsSlow(ReadOnlySpan chunk) + { + if (_headers.IsDefault) + { + if (_headersBuilder.Capacity == _headersBuilder.Count) + { + throw new CursivelyTooManyHeadersException(_headersBuilder.Capacity); + } + + if (chunk.IsEmpty) + { + // the tokenizer will never do this, but an external caller might. + return; + } + + fixed (byte* b = &chunk[0]) + { + VisitHeaderChunk(b, chunk.Length, false); + } + } + else + { + Debug.Assert(_currentFieldIndex >= _headers.Length, "Another condition brought us into VisitPartialFieldContentsSlow without updating this bit."); + VisitUnexpectedDataField(); + VisitPartialDataFieldContents(chunk); + } + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private unsafe void VisitEndOfFieldSlow(ReadOnlySpan chunk) + { + if (_headers.IsDefault) + { + if (_headersBuilder.Capacity == _headersBuilder.Count) + { + throw new CursivelyTooManyHeadersException(_headersBuilder.Capacity); + } + + if (chunk.IsEmpty) + { + // the tokenizer will never do this, but an external caller might. note that + // the Decoder methods require a non-null pointer, even if the length is zero. + byte b = 0xFF; + VisitHeaderChunk(&b, 0, true); + } + else + { + fixed (byte* b = &chunk[0]) + { + VisitHeaderChunk(b, chunk.Length, true); + } + } + + int headerBufferOffset = 0; + + if (_headersBuilder.Count == 0 && + _ignoreUTF8IdentifierOnFirstHeaderField && + _headerBufferConsumed > 0 && + _headerBuffer[0] == '\uFEFF') + { + headerBufferOffset = 1; + } + + _headersBuilder.Add(new string(_headerBuffer, headerBufferOffset, _headerBufferConsumed - headerBufferOffset)); + _headerBufferConsumed = 0; + ++_currentFieldIndex; + } + else + { + Debug.Assert(_currentFieldIndex >= _headers.Length, "Another condition brought us into VisitEndOfFieldSlow without updating this bit."); + VisitUnexpectedDataField(); + VisitEndOfDataField(chunk); + _currentFieldIndex = checked(_currentFieldIndex + 1); + } + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private void VisitEndOfRecordSlow() + { + if (_headers.IsDefault) + { + if (_headersBuilder.Count == 0) + { + // the tokenizer will never do this, but an external caller might. + throw new InvalidOperationException("No fields were present in the header record."); + } + + // this is almost equivalent to setting _headers = _headersBuilder.ToImmutable(), + // but this does a better job rewarding people for setting the max field count to + // the actual field count, which will often be the case. + _headersBuilder.Capacity = _headersBuilder.Count; + _headers = _headersBuilder.MoveToImmutable(); + _currentFieldIndex = _headers.Length; + + // we're done building headers, so free up our buffer. + _headerBuffer = null; + + // let the subclass know that the headers are ready, in case it wants to set up some + // stuff before the field data starts rolling in. 
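+                // (from this point on, the Headers property will never change,
+                // so the override can safely cache anything derived from it.)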
+ VisitEndOfHeaderRecord(); + } + else + { + Debug.Assert(_currentFieldIndex != _headers.Length, "Another condition brought us into VisitEndOfRecordSlow without updating this bit."); + if (_currentFieldIndex < _headers.Length) + { + VisitMissingDataFields(); + } + + VisitEndOfDataRecord(); + } + + _currentFieldIndex = 0; + } + + private unsafe void VisitHeaderChunk(byte* b, int byteCount, bool flush) + { + int charCount = _headerDecoder.GetCharCount(b, byteCount, flush); + if (_headerBufferConsumed + charCount <= _headerBuffer.Length) + { + fixed (char* c = &_headerBuffer[_headerBufferConsumed]) + { + _headerDecoder.GetChars(b, byteCount, c, charCount, flush); + } + } + else + { + throw new CursivelyHeaderIsTooLongException(_headerBuffer.Length); + } + + _headerBufferConsumed += charCount; + } + } +} diff --git a/src/Cursively/CsvTokenizer.cs b/src/Cursively/CsvTokenizer.cs index ce80a4d..0595009 100644 --- a/src/Cursively/CsvTokenizer.cs +++ b/src/Cursively/CsvTokenizer.cs @@ -247,7 +247,7 @@ public void ProcessNextChunk(ReadOnlySpan chunk, CsvReaderVisitorBase visi } else if (c == CR || c == LF) { - ProcessEndOfLine(chunk.Slice(0, idx), visitor); + ProcessEndOfRecord(chunk.Slice(0, idx), visitor); } else if (c == QUOTE) { @@ -306,7 +306,7 @@ public void ProcessEndOfStream(CsvReaderVisitorBase visitor) visitor = CsvReaderVisitorBase.Null; } - ProcessEndOfLine(default, visitor); + ProcessEndOfRecord(default, visitor); } private void PickUpFromLastTime(ref ReadOnlySpan readBuffer, CsvReaderVisitorBase visitor) @@ -356,7 +356,7 @@ private void PickUpFromLastTime(ref ReadOnlySpan readBuffer, CsvReaderVisi { // same thing as the delimiter case, just the field ended at the end of a line // instead of the end of a field on the current line. - ProcessEndOfLine(readBuffer.Slice(0, idx), visitor); + ProcessEndOfRecord(readBuffer.Slice(0, idx), visitor); } else { @@ -398,7 +398,7 @@ private void PickUpFromLastTime(ref ReadOnlySpan readBuffer, CsvReaderVisi } else if (b == CR || b == LF) { - ProcessEndOfLine(readBuffer.Slice(0, idx), visitor); + ProcessEndOfRecord(readBuffer.Slice(0, idx), visitor); } else { @@ -453,7 +453,7 @@ private void HandleBufferCutAtPotentiallyTerminalDoubleQuote(ref ReadOnlySpan lastFieldDataChunk, CsvReaderVisitorBase visitor) + private void ProcessEndOfRecord(ReadOnlySpan lastFieldDataChunk, CsvReaderVisitorBase visitor) { // even if the last field data chunk is empty, we still need to send it: we might be // looking at a newline that immediately follows a comma, which is defined to mean diff --git a/src/Cursively/Cursively.csproj b/src/Cursively/Cursively.csproj index 06e06e7..ebdabd9 100644 --- a/src/Cursively/Cursively.csproj +++ b/src/Cursively/Cursively.csproj @@ -16,6 +16,7 @@ + diff --git a/src/Cursively/CursivelyDataStreamException.cs b/src/Cursively/CursivelyDataStreamException.cs new file mode 100644 index 0000000..9926d37 --- /dev/null +++ b/src/Cursively/CursivelyDataStreamException.cs @@ -0,0 +1,31 @@ +using System; +using System.Diagnostics.CodeAnalysis; +using System.Runtime.Serialization; + +namespace Cursively +{ + /// + /// Serves as the base class for exceptions thrown by this library to indicate problems with the + /// actual contents of a CSV stream. 
+ /// + [Serializable] + [SuppressMessage("Microsoft.Design", "CA1032:ImplementStandardExceptionConstructors")] + public abstract class CursivelyDataStreamException : Exception + { + private protected CursivelyDataStreamException(string message) + : base(message) + { + } + + private protected CursivelyDataStreamException(string message, Exception innerException) + : base(message, innerException) + { + } + + [SuppressMessage("Microsoft.Usage", "CA2229:ImplementSerializationConstructors")] + private protected CursivelyDataStreamException(SerializationInfo info, StreamingContext context) + : base(info, context) + { + } + } +} diff --git a/src/Cursively/CursivelyDecoderExceptionFallback.cs b/src/Cursively/CursivelyDecoderExceptionFallback.cs new file mode 100644 index 0000000..22192ea --- /dev/null +++ b/src/Cursively/CursivelyDecoderExceptionFallback.cs @@ -0,0 +1,42 @@ +using System.Text; + +namespace Cursively +{ + internal sealed class CursivelyDecoderExceptionFallback : DecoderFallback + { + public override int MaxCharCount => 0; + + public override DecoderFallbackBuffer CreateFallbackBuffer() => new CursivelyDecoderExceptionFallbackBuffer(); + + public override bool Equals(object obj) => obj is CursivelyDecoderExceptionFallback; + + public override int GetHashCode() => 1234; + + private sealed class CursivelyDecoderExceptionFallbackBuffer : DecoderFallbackBuffer + { + public override int Remaining => 0; + + public override char GetNextChar() => '\0'; + + public override bool MovePrevious() => false; + + public override bool Fallback(byte[] bytesUnknown, int index) + { + // use the built-in logic to get a helpful exception message. + var inner = new DecoderExceptionFallbackBuffer(); + try + { + return inner.Fallback(bytesUnknown, index); + } + catch (DecoderFallbackException ex) + { + // wrap it. C# / .NET do not support multiple inheritance, and I think it's + // more important for consumers to be able to catch CursivelyDataStreamException + // for all exceptions in the form of "this breaks one of Cursively's rules, but + // the system is otherwise operating normally". + throw new CursivelyHeadersAreNotUTF8Exception(ex); + } + } + } + } +} diff --git a/src/Cursively/CursivelyExtraDataFieldsException.cs b/src/Cursively/CursivelyExtraDataFieldsException.cs new file mode 100644 index 0000000..71996b1 --- /dev/null +++ b/src/Cursively/CursivelyExtraDataFieldsException.cs @@ -0,0 +1,25 @@ +using System; +using System.Diagnostics.CodeAnalysis; +using System.Runtime.Serialization; + +namespace Cursively +{ + /// + /// Raised by , by default, when a data record + /// contains more fields than the header record. 
+ /// + [Serializable] + [SuppressMessage("Microsoft.Design", "CA1032:ImplementStandardExceptionConstructors")] + public sealed class CursivelyExtraDataFieldsException : CursivelyDataStreamException + { + internal CursivelyExtraDataFieldsException(int headerFieldCount) + : base($"CSV stream contains a non-header record with more fields than the {headerFieldCount} field(s) present in the header record.") + { + } + + private CursivelyExtraDataFieldsException(SerializationInfo info, StreamingContext context) + : base(info, context) + { + } + } +} diff --git a/src/Cursively/CursivelyHeaderIsTooLongException.cs b/src/Cursively/CursivelyHeaderIsTooLongException.cs new file mode 100644 index 0000000..882098e --- /dev/null +++ b/src/Cursively/CursivelyHeaderIsTooLongException.cs @@ -0,0 +1,25 @@ +using System; +using System.Diagnostics.CodeAnalysis; +using System.Runtime.Serialization; + +namespace Cursively +{ + /// + /// Raised by when the length of a header + /// exceeds the configured maximum. + /// + [Serializable] + [SuppressMessage("Microsoft.Design", "CA1032:ImplementStandardExceptionConstructors")] + public sealed class CursivelyHeaderIsTooLongException : CursivelyDataStreamException + { + internal CursivelyHeaderIsTooLongException(int maxLength) + : base($"CSV stream contains a header that is longer than the configured max length of {maxLength}.") + { + } + + private CursivelyHeaderIsTooLongException(SerializationInfo info, StreamingContext context) + : base(info, context) + { + } + } +} diff --git a/src/Cursively/CursivelyHeadersAreNotUTF8Exception.cs b/src/Cursively/CursivelyHeadersAreNotUTF8Exception.cs new file mode 100644 index 0000000..283068b --- /dev/null +++ b/src/Cursively/CursivelyHeadersAreNotUTF8Exception.cs @@ -0,0 +1,32 @@ +using System; +using System.Diagnostics.CodeAnalysis; +using System.Runtime.Serialization; +using System.Text; + +namespace Cursively +{ + /// + /// Raised by , by default, when the header + /// record contains invalid UTF-8 bytes. + /// + [Serializable] + [SuppressMessage("Microsoft.Design", "CA1032:ImplementStandardExceptionConstructors")] + public sealed class CursivelyHeadersAreNotUTF8Exception : CursivelyDataStreamException + { + internal CursivelyHeadersAreNotUTF8Exception(DecoderFallbackException innerException) + : base(innerException.Message, innerException) + { + } + + private CursivelyHeadersAreNotUTF8Exception(SerializationInfo info, StreamingContext context) + : base(info, context) + { + } + + /// + /// Gets the instance that holds the actual decoder + /// state when the current exception was raised. + /// + public DecoderFallbackException InnerDecoderFallbackException => (DecoderFallbackException)InnerException; + } +} diff --git a/src/Cursively/CursivelyMissingDataFieldsException.cs b/src/Cursively/CursivelyMissingDataFieldsException.cs new file mode 100644 index 0000000..03c776d --- /dev/null +++ b/src/Cursively/CursivelyMissingDataFieldsException.cs @@ -0,0 +1,25 @@ +using System; +using System.Diagnostics.CodeAnalysis; +using System.Runtime.Serialization; + +namespace Cursively +{ + /// + /// Raised by , by default, when a data record + /// contains fewer fields than the header record. 
+ /// + [Serializable] + [SuppressMessage("Microsoft.Design", "CA1032:ImplementStandardExceptionConstructors")] + public sealed class CursivelyMissingDataFieldsException : CursivelyDataStreamException + { + internal CursivelyMissingDataFieldsException(int headerFieldCount, int dataFieldCount) + : base($"CSV stream contains a non-header record with only {dataFieldCount} field(s), fewer than the {headerFieldCount} field(s) present in the header record.") + { + } + + private CursivelyMissingDataFieldsException(SerializationInfo info, StreamingContext context) + : base(info, context) + { + } + } +} diff --git a/src/Cursively/CursivelyTooManyHeadersException.cs b/src/Cursively/CursivelyTooManyHeadersException.cs new file mode 100644 index 0000000..ad5f876 --- /dev/null +++ b/src/Cursively/CursivelyTooManyHeadersException.cs @@ -0,0 +1,25 @@ +using System; +using System.Diagnostics.CodeAnalysis; +using System.Runtime.Serialization; + +namespace Cursively +{ + /// + /// Raised by when the number of headers + /// exceeds the configured maximum. + /// + [Serializable] + [SuppressMessage("Microsoft.Design", "CA1032:ImplementStandardExceptionConstructors")] + public sealed class CursivelyTooManyHeadersException : CursivelyDataStreamException + { + internal CursivelyTooManyHeadersException(int maxHeaderCount) + : base($"CSV stream contains more headers than the configured maximum of {maxHeaderCount}.") + { + } + + private CursivelyTooManyHeadersException(SerializationInfo info, StreamingContext context) + : base(info, context) + { + } + } +} diff --git a/test/Cursively.Tests/CsvTokenizerTests.cs b/test/Cursively.Tests/CsvTokenizerTests.cs index add2793..4095bd8 100644 --- a/test/Cursively.Tests/CsvTokenizerTests.cs +++ b/test/Cursively.Tests/CsvTokenizerTests.cs @@ -20,15 +20,12 @@ public sealed class CsvTokenizerTests private static readonly byte[] TestDelimiters = { (byte)',', (byte)'\t' }; public static IEnumerable TestCsvFiles => - from filePath in Directory.EnumerateFiles(TestCsvFilesFolderPath, "*.csv") + from filePath in Directory.EnumerateFiles(TestCsvFilesFolderPath, "*.csv", SearchOption.AllDirectories) select new object[] { filePath }; - public static IEnumerable TestCsvFilesWithChunkLengthsAndDelimiters => - from filePath in Directory.EnumerateFiles(TestCsvFilesFolderPath, "*.csv") - let fileName = Path.GetFileNameWithoutExtension(filePath) - from chunkLength in TestChunkLengths - from delimiter in TestDelimiters - select new object[] { fileName, chunkLength, delimiter }; + public static IEnumerable TestCsvFilesWithChunkLengthsAndDelimiters => GetTestCsvFilesWithChunkLengthsAndDelimiters(); + + public static IEnumerable TestValidHeaderedCsvFilesWithChunkLengthsAndDelimiters => GetTestCsvFilesWithChunkLengthsAndDelimiters("with-headers", "valid"); [Theory] [InlineData((byte)0x0A)] @@ -56,10 +53,10 @@ public void NullVisitorShouldBeFine(string filePath) [Theory] [MemberData(nameof(TestCsvFilesWithChunkLengthsAndDelimiters))] - public void CsvTokenizationShouldMatchCsvHelper(string fileName, int chunkLength, byte delimiter) + public void CsvTokenizationShouldMatchCsvHelper(string filePath, int chunkLength, byte delimiter) { // arrange - byte[] fileDataTemplate = File.ReadAllBytes(Path.Combine(TestCsvFilesFolderPath, fileName + ".csv")); + byte[] fileDataTemplate = File.ReadAllBytes(filePath); for (int i = 0; i < fileDataTemplate.Length; i++) { if (fileDataTemplate[i] == (byte)',') @@ -68,7 +65,7 @@ public void CsvTokenizationShouldMatchCsvHelper(string fileName, int chunkLength } } - int 
randomSeed = HashCode.Combine(fileName, chunkLength, delimiter); + int randomSeed = HashCode.Combine(filePath, chunkLength, delimiter); foreach (byte[] fileData in VaryLineEndings(fileDataTemplate, randomSeed)) { // act @@ -119,6 +116,62 @@ public void NonstandardQuotedFieldsShouldNotify() Assert.Equal(expectedContentsBeforeNonstandardFields, visitor.ContentsBeforeNonstandardFields); } + [Theory] + [MemberData(nameof(TestValidHeaderedCsvFilesWithChunkLengthsAndDelimiters))] + public void HeaderedCsvTokenizationShouldMatchCsvHelper(string filePath, int chunkLength, byte delimiter) + { + // arrange + byte[] fileDataTemplate = File.ReadAllBytes(filePath); + for (int i = 0; i < fileDataTemplate.Length; i++) + { + if (fileDataTemplate[i] == (byte)',') + { + fileDataTemplate[i] = delimiter; + } + } + + int randomSeed = HashCode.Combine(filePath, chunkLength, delimiter); + foreach (byte[] fileData in VaryLineEndings(fileDataTemplate, randomSeed)) + { + // act + var actual = TokenizeHeaderedCsvFileUsingCursively(fileData, chunkLength, delimiter); + + // assert + var expected = TokenizeCsvFileUsingCsvHelper(fileData, $"{(char)delimiter}"); + Assert.Equal(expected, actual); + } + } + + [Fact] + public void HeaderedCsvTokenizationShouldRejectTooManyDataFieldsByDefault() + { + // arrange + byte[] fileData = File.ReadAllBytes(Path.Combine(TestCsvFilesFolderPath, "with-headers", "invalid", "too-many-data-fields.csv")); + + // act, assert + Assert.Throws(() => TokenizeHeaderedCsvFileUsingCursively(fileData, fileData.Length, (byte)',')); + } + + [Fact] + public void HeaderedCsvTokenizationShouldRejectMissingDataFieldsByDefault() + { + // arrange + byte[] fileData = File.ReadAllBytes(Path.Combine(TestCsvFilesFolderPath, "with-headers", "invalid", "missing-data-fields.csv")); + + // act, assert + Assert.Throws(() => TokenizeHeaderedCsvFileUsingCursively(fileData, fileData.Length, (byte)',')); + } + + [Fact] + public void HeaderedCsvTokenizationShouldRejectInvalidUTF8ByDefault() + { + // arrange + byte[] fileData = File.ReadAllBytes(Path.Combine(TestCsvFilesFolderPath, "with-headers", "invalid", "invalid-utf8-in-header.csv")); + + // act, assert + Assert.Throws(() => TokenizeHeaderedCsvFileUsingCursively(fileData, fileData.Length, (byte)',')); + } + private static List TokenizeCsvFileUsingCursively(ReadOnlySpan fileData, int chunkLength, byte delimiter) { var tokenizer = new CsvTokenizer(delimiter); @@ -134,6 +187,21 @@ private static List TokenizeCsvFileUsingCursively(ReadOnlySpan f return visitor.Records; } + private static List TokenizeHeaderedCsvFileUsingCursively(ReadOnlySpan fileData, int chunkLength, byte delimiter) + { + var tokenizer = new CsvTokenizer(delimiter); + var visitor = new HeaderedStringBufferingVisitor(fileData.Length); + while (fileData.Length >= chunkLength) + { + tokenizer.ProcessNextChunk(fileData.Slice(0, chunkLength), visitor); + fileData = fileData.Slice(chunkLength); + } + + tokenizer.ProcessNextChunk(fileData, visitor); + tokenizer.ProcessEndOfStream(visitor); + return visitor.Records; + } + private static IEnumerable TokenizeCsvFileUsingCsvHelper(byte[] csvData, string delimiter) { using (var stream = new MemoryStream(csvData, false)) @@ -197,6 +265,12 @@ private static byte[][] VaryLineEndings(ReadOnlySpan fileData, int randomS return Array.ConvertAll(resultLists, lst => lst.ToArray()); } + private static IEnumerable GetTestCsvFilesWithChunkLengthsAndDelimiters(params string[] pathParts) => + from filePath in Directory.EnumerateFiles(Path.Combine(TestCsvFilesFolderPath, 
Path.Combine(pathParts)), "*.csv", SearchOption.AllDirectories) + from chunkLength in TestChunkLengths + from delimiter in TestDelimiters + select new object[] { filePath, chunkLength, delimiter }; + private sealed class StringBufferingVisitor : CsvReaderVisitorBase { private static readonly UTF8Encoding TheEncoding = new UTF8Encoding(false, false); @@ -278,5 +352,51 @@ private void VisitFieldContents(ReadOnlySpan chunk, bool flush) } } } + + private sealed class HeaderedStringBufferingVisitor : CsvReaderVisitorWithUTF8HeadersBase + { + private static readonly UTF8Encoding TheEncoding = new UTF8Encoding(false, false); + + private readonly List _fields = new List(); + + private readonly byte[] _cutBuffer; + + private int _cutBufferConsumed; + + public HeaderedStringBufferingVisitor(int fileLength) => _cutBuffer = new byte[fileLength]; + + public List Records { get; } = new List(); + + protected override void VisitEndOfHeaderRecord() + { + Records.Insert(0, Headers.ToArray()); + } + + protected override void VisitEndOfDataRecord() + { + Records.Add(_fields.ToArray()); + _fields.Clear(); + } + + protected override void VisitPartialDataFieldContents(ReadOnlySpan chunk) => CopyToCutBuffer(chunk); + + protected override void VisitEndOfDataField(ReadOnlySpan chunk) + { + if (_cutBufferConsumed != 0) + { + CopyToCutBuffer(chunk); + chunk = new ReadOnlySpan(_cutBuffer, 0, _cutBufferConsumed); + } + + _fields.Add(TheEncoding.GetString(chunk)); + _cutBufferConsumed = 0; + } + + private void CopyToCutBuffer(ReadOnlySpan chunk) + { + chunk.CopyTo(new Span(_cutBuffer, _cutBufferConsumed, chunk.Length)); + _cutBufferConsumed += chunk.Length; + } + } } } diff --git a/test/Cursively.Tests/Cursively.Tests.csproj b/test/Cursively.Tests/Cursively.Tests.csproj index ddb7268..a969ff3 100644 --- a/test/Cursively.Tests/Cursively.Tests.csproj +++ b/test/Cursively.Tests/Cursively.Tests.csproj @@ -17,7 +17,7 @@ - + diff --git a/test/Cursively.Tests/TestCsvFiles/with-headers/invalid/invalid-utf8-in-header.csv b/test/Cursively.Tests/TestCsvFiles/with-headers/invalid/invalid-utf8-in-header.csv new file mode 100644 index 0000000..7ff9b39 --- /dev/null +++ b/test/Cursively.Tests/TestCsvFiles/with-headers/invalid/invalid-utf8-in-header.csv @@ -0,0 +1,2 @@ +é, +a, \ No newline at end of file diff --git a/test/Cursively.Tests/TestCsvFiles/with-headers/invalid/missing-data-fields.csv b/test/Cursively.Tests/TestCsvFiles/with-headers/invalid/missing-data-fields.csv new file mode 100644 index 0000000..7cc53b4 --- /dev/null +++ b/test/Cursively.Tests/TestCsvFiles/with-headers/invalid/missing-data-fields.csv @@ -0,0 +1,3 @@ +a,b,c +1,2,3 +1,2 \ No newline at end of file diff --git a/test/Cursively.Tests/TestCsvFiles/with-headers/invalid/too-many-data-fields.csv b/test/Cursively.Tests/TestCsvFiles/with-headers/invalid/too-many-data-fields.csv new file mode 100644 index 0000000..636ad5f --- /dev/null +++ b/test/Cursively.Tests/TestCsvFiles/with-headers/invalid/too-many-data-fields.csv @@ -0,0 +1,2 @@ +"a","b","c" +"a","b","c", \ No newline at end of file diff --git a/test/Cursively.Tests/TestCsvFiles/with-headers/valid/invalid-utf8-outside-header.csv b/test/Cursively.Tests/TestCsvFiles/with-headers/valid/invalid-utf8-outside-header.csv new file mode 100644 index 0000000..ee4f68d --- /dev/null +++ b/test/Cursively.Tests/TestCsvFiles/with-headers/valid/invalid-utf8-outside-header.csv @@ -0,0 +1,2 @@ +a, +é, \ No newline at end of file diff --git a/test/Cursively.Tests/TestCsvFiles/with-headers/valid/simple.csv 
b/test/Cursively.Tests/TestCsvFiles/with-headers/valid/simple.csv new file mode 100644 index 0000000..b4819c6 --- /dev/null +++ b/test/Cursively.Tests/TestCsvFiles/with-headers/valid/simple.csv @@ -0,0 +1,3 @@ +A,B,C +1,2,3 +do,re,mi \ No newline at end of file From e3f88858866f58500a648de4611748c12c56f635 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Sat, 15 Jun 2019 14:51:16 -0400 Subject: [PATCH 20/22] Rename to "ProcessFile". "Entire" was redundant. --- src/Cursively/Csv.cs | 2 +- test/Cursively.Tests/CsvTokenizerTests.cs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Cursively/Csv.cs b/src/Cursively/Csv.cs index 917992a..eca7491 100644 --- a/src/Cursively/Csv.cs +++ b/src/Cursively/Csv.cs @@ -275,7 +275,7 @@ public static async ValueTask ProcessStreamAsync(Stream csvStream, CsvReaderVisi /// /// See . /// - public static unsafe void ProcessEntireFile(string csvFilePath, CsvReaderVisitorBase visitor) + public static unsafe void ProcessFile(string csvFilePath, CsvReaderVisitorBase visitor) { using (var fl = new FileStream(csvFilePath, FileMode.Open, FileAccess.Read, FileShare.Read, 4096, FileOptions.SequentialScan)) { diff --git a/test/Cursively.Tests/CsvTokenizerTests.cs b/test/Cursively.Tests/CsvTokenizerTests.cs index 4095bd8..77f951e 100644 --- a/test/Cursively.Tests/CsvTokenizerTests.cs +++ b/test/Cursively.Tests/CsvTokenizerTests.cs @@ -85,7 +85,7 @@ public void MemoryMappedCsvShouldMatchCsvHelper(string filePath) var visitor = new StringBufferingVisitor(checked((int)new FileInfo(filePath).Length)); // act - Csv.ProcessEntireFile(filePath, visitor); + Csv.ProcessFile(filePath, visitor); var actual = visitor.Records; // assert @@ -101,7 +101,7 @@ public void NonstandardQuotedFieldsShouldNotify() var visitor = new NonstandardFieldVisitor(checked((int)new FileInfo(csvFilePath).Length)); // act - Csv.ProcessEntireFile(csvFilePath, visitor); + Csv.ProcessFile(csvFilePath, visitor); // assert string[] expectedContentsBeforeNonstandardFields = From 67d558d69965d9dc4bf3119ee2dd297fe8e5fe24 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Sat, 15 Jun 2019 14:51:36 -0400 Subject: [PATCH 21/22] Update README.md --- README.md | 101 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 66 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 7b8411d..ea255e9 100644 --- a/README.md +++ b/README.md @@ -9,43 +9,10 @@ A fast, [RFC 4180](https://tools.ietf.org/html/rfc4180)-conforming CSV reading l Documentation is currently being published as [GitHub Pages](https://airbreather.github.io/Cursively/index.html). ## Usage -1. Create a subclass of `CsvReaderVisitorBase` with your own logic. -1. To read a CSV file: - - Create a new instance of your visitor. - - Create a new instance of `CsvTokenizer`. - - Call `CsvTokenizer.ProcessNextChunk` for each chunk of the file. - - Call `CsvTokenizer.ProcessEndOfStream` after the last chunk of the file. - -## Example -This demonstrates using Cursively to write the details of a particular UTF-8 encoded file to the console. +Create a subclass of `CsvReaderVisitorBase` (or one of its own built-in subclasses) with your own logic for processing the individual elements in order. Then, you have some options. 
+
+### Example Visitor
 ```csharp
-public static void ProcessCsvFile(string csvFilePath)
-{
-    var myVisitor = new MyVisitor(maxFieldLength: 1000);
-    var tokenizer = new CsvTokenizer();
-    using (var file = File.OpenRead(csvFilePath))
-    {
-        Console.WriteLine($"Started reading '{csvFilePath}'.");
-        Span fileReadBuffer = new byte[4096];
-        while (true)
-        {
-            int count = file.Read(fileReadBuffer);
-            if (count == 0)
-            {
-                break;
-            }
-
-            var chunk = fileReadBuffer.Slice(0, count);
-            tokenizer.ProcessNextChunk(chunk, myVisitor);
-        }
-
-        tokenizer.ProcessEndOfStream(myVisitor);
-    }
-
-    Console.WriteLine($"Finished reading '{csvFilePath}'.");
-}
-
 public sealed class MyVisitor : CsvReaderVisitorBase
 {
     private readonly Decoder _utf8Decoder = Encoding.UTF8.GetDecoder();
@@ -88,3 +55,67 @@ public sealed class MyVisitor : CsvReaderVisitorBase
     }
 }
 ```
+
+### Fastest
+All of the other methods of processing the data are built on top of this, so it gives you the most control:
+1. Create a new instance of your visitor.
+1. Create a new instance of `CsvTokenizer`.
+1. Call `CsvTokenizer.ProcessNextChunk` for each chunk of the file.
+1. Call `CsvTokenizer.ProcessEndOfStream` after the last chunk of the file.
+
+Example:
+```csharp
+public static void ProcessCsvFile(string csvFilePath)
+{
+    var myVisitor = new MyVisitor(maxFieldLength: 1000);
+    var tokenizer = new CsvTokenizer();
+    using (var file = File.OpenRead(csvFilePath))
+    {
+        Console.WriteLine($"Started reading '{csvFilePath}'.");
+        Span fileReadBuffer = new byte[4096];
+        while (true)
+        {
+            int count = file.Read(fileReadBuffer);
+            if (count == 0)
+            {
+                break;
+            }
+
+            var chunk = fileReadBuffer.Slice(0, count);
+            tokenizer.ProcessNextChunk(chunk, myVisitor);
+        }
+
+        tokenizer.ProcessEndOfStream(myVisitor);
+    }
+
+    Console.WriteLine($"Finished reading '{csvFilePath}'.");
+}
+```
+
+### Simpler
+1. Create a new instance of your visitor.
+1. Call one of the `Csv.Process*` methods, passing in whatever format your data is in along with your visitor.
+
+Examples:
+```csharp
+public static void ProcessCsvFile(string csvFilePath)
+{
+    Console.WriteLine($"Started reading '{csvFilePath}'.");
+    Csv.ProcessFile(csvFilePath, new MyVisitor(maxFieldLength: 1000));
+    Console.WriteLine($"Finished reading '{csvFilePath}'.");
+}
+
+public static void ProcessCsvStream(Stream csvStream)
+{
+    Console.WriteLine("Started reading the CSV stream.");
+    Csv.ProcessStream(csvStream, new MyVisitor(maxFieldLength: 1000));
+    Console.WriteLine("Finished reading the CSV stream.");
+}
+
+public static async ValueTask ProcessCsvStreamAsync(Stream csvStream, IProgress progress = null, CancellationToken cancellationToken = default)
+{
+    Console.WriteLine("Started reading the CSV stream.");
+    await Csv.ProcessStreamAsync(csvStream, new MyVisitor(maxFieldLength: 1000), progress, cancellationToken);
+    Console.WriteLine("Finished reading the CSV stream.");
+}
+```

From 3de2d12e2f98c57d8696f3a2c4297ce221b96fe9 Mon Sep 17 00:00:00 2001
From: Joe Amenta
Date: Sat, 15 Jun 2019 15:27:11 -0400
Subject: [PATCH 22/22] doc updates

---
 doc/benchmark-1.1.0.md | 79 ++++++++++++++++++++++++++++++++++++++++++
 doc/release-notes.md   |  8 +++++
 doc/toc.yml            |  2 +-
 3 files changed, 88 insertions(+), 1 deletion(-)
 create mode 100644 doc/benchmark-1.1.0.md

diff --git a/doc/benchmark-1.1.0.md b/doc/benchmark-1.1.0.md
new file mode 100644
index 0000000..a588ec9
--- /dev/null
+++ b/doc/benchmark-1.1.0.md
@@ -0,0 +1,79 @@
+This benchmark tests the simple act of counting how many records are in a CSV file.
It's not a simple count of how many lines are in the text file: line breaks within quoted fields must be treated as data, and multiple line breaks in a row must be treated as one, since each record must have at least one field. Therefore, assuming correct implementations, this benchmark should test the raw CSV processing speed. + +Cursively eliminates a ton of overhead found in libraries such as CsvHelper by restricting the allowed input encodings and using the visitor pattern as its only means of output. Cursively can scan through the original bytes of the input to do its work, and it can give slices of the input data directly to the consumer without having to copy or allocate. + +Therefore, these benchmarks are somewhat biased in favor of Cursively, as CsvHelper relies on external code to transform the data to UTF-16. This isn't as unfair as that makes it sound: the overwhelming majority of input files are probably UTF-8 anyway (or a compatible SBCS), so this transformation is something that practically every user will experience. + +- Input files can be found here: https://github.com/airbreather/Cursively/tree/v1.1.0/test/Cursively.Benchmark/large-csv-files.zip +- Benchmark source code is this: https://github.com/airbreather/Cursively/tree/v1.1.0/test/Cursively.Benchmark + +Raw BenchmarkDotNet output is at the bottom, but here are some numbers derived from it. The data was fully loaded in main memory when running these tests. This summary also does not indicate anything about the GC pressure: + +|CSV File|Runtime|Library|Throughput| +|-|-|-|-| +|100 records / 10,000 tiny fields each|.NET 4.7.2|Cursively|336.06 MiB/s| +|100 records / 10,000 tiny fields each|.NET 4.7.2|CsvHelper|22.04 MiB/s| +|100 records / 10,000 tiny fields each|.NET Core 2.2.5|Cursively|487.59 MiB/s| +|100 records / 10,000 tiny fields each|.NET Core 2.2.5|CsvHelper|27.31 MiB/s| +|100 records / 10,000 tiny quoted fields each|.NET 4.7.2|Cursively|178.23 MiB/s| +|100 records / 10,000 tiny quoted fields each|.NET 4.7.2|CsvHelper|24.33 MiB/s| +|100 records / 10,000 tiny quoted fields each|.NET Core 2.2.5|Cursively|303.67 MiB/s| +|100 records / 10,000 tiny quoted fields each|.NET Core 2.2.5|CsvHelper|29.20 MiB/s| +|10,000 records / 1,000 empty fields each|.NET 4.7.2|Cursively|176.71 MiB/s| +|10,000 records / 1,000 empty fields each|.NET 4.7.2|CsvHelper|14.45 MiB/s| +|10,000 records / 1,000 empty fields each|.NET Core 2.2.5|Cursively|306.49 MiB/s| +|10,000 records / 1,000 empty fields each|.NET Core 2.2.5|CsvHelper|15.15 MiB/s| +|Mock data from Mockaroo|.NET 4.7.2|Cursively|2,711.41 MiB/s| +|Mock data from Mockaroo|.NET 4.7.2|CsvHelper|72.50 MiB/s| +|Mock data from Mockaroo|.NET Core 2.2.5|Cursively|3,755.55 MiB/s| +|Mock data from Mockaroo|.NET Core 2.2.5|CsvHelper|75.05 MiB/s| +|worldcitiespop.csv ([from here](https://burntsushi.net/stuff/))|.NET 4.7.2|Cursively|390.75 MiB/s| +|worldcitiespop.csv ([from here](https://burntsushi.net/stuff/))|.NET 4.7.2|CsvHelper|40.15 MiB/s| +|worldcitiespop.csv ([from here](https://burntsushi.net/stuff/))|.NET Core 2.2.5|Cursively|607.81 MiB/s| +|worldcitiespop.csv ([from here](https://burntsushi.net/stuff/))|.NET Core 2.2.5|CsvHelper|39.90 MiB/s| + +Raw BenchmarkDotNet output: + +``` ini + +BenchmarkDotNet=v0.11.5, OS=Windows 10.0.18362 +Intel Core i7-6850K CPU 3.60GHz (Skylake), 1 CPU, 12 logical and 6 physical cores +.NET Core SDK=3.0.100-preview6-012264 + [Host] : .NET Core 2.2.5 (CoreCLR 4.6.27617.05, CoreFX 4.6.27618.01), 64bit RyuJIT + Job-DDQSKN : .NET Framework 4.7.2 (CLR 
4.0.30319.42000), 64bit RyuJIT-v4.8.3801.0 + Job-RTHUVO : .NET Core 2.2.5 (CoreCLR 4.6.27617.05, CoreFX 4.6.27618.01), 64bit RyuJIT + +Server=True + +``` +| Method | Runtime | csvFile | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated | +|------------------------ |-------- |--------------------- |-------------:|----------:|----------:|------:|--------:|------------:|-----------:|---------:|-------------:| +| CountRowsUsingCursively | Clr | 100-huge-records | 8.231 ms | 0.0839 ms | 0.0743 ms | 1.00 | 0.00 | - | - | - | 128 B | +| CountRowsUsingCsvHelper | Clr | 100-huge-records | 125.493 ms | 1.1717 ms | 1.0387 ms | 15.25 | 0.21 | 17250.0000 | 6750.0000 | 750.0000 | 110560856 B | +| | | | | | | | | | | | | +| CountRowsUsingCursively | Core | 100-huge-records | 5.673 ms | 0.0073 ms | 0.0068 ms | 1.00 | 0.00 | - | - | - | 48 B | +| CountRowsUsingCsvHelper | Core | 100-huge-records | 101.277 ms | 0.2342 ms | 0.2190 ms | 17.85 | 0.05 | 400.0000 | 200.0000 | - | 110256320 B | +| | | | | | | | | | | | | +| CountRowsUsingCursively | Clr | 100-h(...)uoted [23] | 26.222 ms | 0.0260 ms | 0.0231 ms | 1.00 | 0.00 | - | - | - | 256 B | +| CountRowsUsingCsvHelper | Clr | 100-h(...)uoted [23] | 192.090 ms | 0.9954 ms | 0.9311 ms | 7.33 | 0.04 | 25000.0000 | 11000.0000 | 666.6667 | 154027456 B | +| | | | | | | | | | | | | +| CountRowsUsingCursively | Core | 100-h(...)uoted [23] | 15.390 ms | 0.0450 ms | 0.0399 ms | 1.00 | 0.00 | - | - | - | 48 B | +| CountRowsUsingCsvHelper | Core | 100-h(...)uoted [23] | 160.043 ms | 0.4644 ms | 0.4344 ms | 10.40 | 0.04 | 333.3333 | - | - | 153579848 B | +| | | | | | | | | | | | | +| CountRowsUsingCursively | Clr | 10k-empty-records | 54.007 ms | 0.3061 ms | 0.2556 ms | 1.00 | 0.00 | - | - | - | 819 B | +| CountRowsUsingCsvHelper | Clr | 10k-empty-records | 661.502 ms | 3.1801 ms | 2.9747 ms | 12.24 | 0.08 | 66000.0000 | 2000.0000 | - | 422077104 B | +| | | | | | | | | | | | | +| CountRowsUsingCursively | Core | 10k-empty-records | 31.178 ms | 0.2056 ms | 0.1924 ms | 1.00 | 0.00 | - | - | - | 48 B | +| CountRowsUsingCsvHelper | Core | 10k-empty-records | 630.683 ms | 1.2503 ms | 1.1084 ms | 20.23 | 0.13 | 2000.0000 | - | - | 420832856 B | +| | | | | | | | | | | | | +| CountRowsUsingCursively | Clr | mocked | 4.478 ms | 0.0071 ms | 0.0067 ms | 1.00 | 0.00 | - | - | - | 64 B | +| CountRowsUsingCsvHelper | Clr | mocked | 167.477 ms | 0.3523 ms | 0.3296 ms | 37.40 | 0.08 | 18333.3333 | 333.3333 | - | 116105312 B | +| | | | | | | | | | | | | +| CountRowsUsingCursively | Core | mocked | 3.233 ms | 0.0063 ms | 0.0059 ms | 1.00 | 0.00 | - | - | - | 48 B | +| CountRowsUsingCsvHelper | Core | mocked | 161.791 ms | 0.3473 ms | 0.3249 ms | 50.05 | 0.15 | 333.3333 | - | - | 115757736 B | +| | | | | | | | | | | | | +| CountRowsUsingCursively | Clr | worldcitiespop | 369.738 ms | 0.6855 ms | 0.6077 ms | 1.00 | 0.00 | - | - | - | 8192 B | +| CountRowsUsingCsvHelper | Clr | worldcitiespop | 3,598.421 ms | 2.0735 ms | 1.9396 ms | 9.73 | 0.02 | 493000.0000 | 7000.0000 | - | 3105811440 B | +| | | | | | | | | | | | | +| CountRowsUsingCursively | Core | worldcitiespop | 237.695 ms | 0.2994 ms | 0.2800 ms | 1.00 | 0.00 | - | - | - | 48 B | +| CountRowsUsingCsvHelper | Core | worldcitiespop | 3,620.550 ms | 3.1766 ms | 2.8160 ms | 15.23 | 0.02 | 15000.0000 | - | - | 3096694312 B | diff --git a/doc/release-notes.md b/doc/release-notes.md index 8c1dc2d..1096e59 100644 --- a/doc/release-notes.md +++ b/doc/release-notes.md @@ -1,4 +1,12 @@ # Cursively Release Notes +## 
[1.1.0](https://github.com/airbreather/Cursively/milestone/1) +- Several further performance optimizations. Most significantly, inlining and tuning a critical `ReadOnlySpan` extension method. + - In some cases, this increased throughput by a factor of 3. +- Added hooks for visitor implementations to detect situations where the stream does not conform to the RFC 4180 rules for quoted fields ([#4](https://github.com/airbreather/Cursively/issues/4)) +- Added support to customize the field delimiter byte ([#11](https://github.com/airbreather/Cursively/issues/11)) +- Added helpers to avoid having to use `CsvTokenizer` directly in most cases ([#9](https://github.com/airbreather/Cursively/issues/9), [#10](https://github.com/airbreather/Cursively/issues/10)) +- Added an intermediate abstract visitor class that handles UTF-8 encoded headers ([#5](https://github.com/airbreather/Cursively/issues/5)) + ## 1.0.0 - Initial release. diff --git a/doc/toc.yml b/doc/toc.yml index 0e11d66..c94d485 100644 --- a/doc/toc.yml +++ b/doc/toc.yml @@ -3,7 +3,7 @@ - name: API Documentation href: obj/api/ - name: Benchmark - href: benchmark-1.0.0.md + href: benchmark-1.1.0.md - name: Release Notes href: release-notes.md - name: NuGet Package
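As a closing illustration of the 1.1.0 surface described in the release notes above, here is a minimal consumer that combines the `Csv.ProcessFile` helper with the headered visitor base class. The field handling is deliberately simplistic and is not part of the library:

```csharp
using System;
using System.Collections.Generic;
using System.Text;
using Cursively;

// Minimal sketch: print each data record as "header=value" pairs using the new
// CsvReaderVisitorWithUTF8HeadersBase. Note: for brevity this decodes each chunk
// independently; production code should reuse a System.Text.Decoder so that
// multi-byte UTF-8 sequences split across chunks decode correctly, as in the
// README example.
public sealed class PrintingVisitor : CsvReaderVisitorWithUTF8HeadersBase
{
    private readonly List<string> _fields = new List<string>();

    private readonly StringBuilder _currentField = new StringBuilder();

    protected override void VisitPartialDataFieldContents(ReadOnlySpan<byte> chunk) =>
        _currentField.Append(Encoding.UTF8.GetString(chunk.ToArray()));

    protected override void VisitEndOfDataField(ReadOnlySpan<byte> chunk)
    {
        _currentField.Append(Encoding.UTF8.GetString(chunk.ToArray()));
        _fields.Add(_currentField.ToString());
        _currentField.Clear();
    }

    protected override void VisitEndOfDataRecord()
    {
        // With the default settings, records with missing or extra fields throw
        // before reaching here, so _fields.Count always matches Headers.Length.
        for (int i = 0; i < _fields.Count; i++)
        {
            Console.WriteLine($"{Headers[i]}={_fields[i]}");
        }

        _fields.Clear();
    }
}

// Usage:
//     Csv.ProcessFile("some-data.csv", new PrintingVisitor());
```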