From e62e9c5deb3ac37b18f5cfb0a1360913cd31451b Mon Sep 17 00:00:00 2001 From: Henning Schmiedehausen Date: Mon, 1 Apr 2019 15:30:04 +0800 Subject: [PATCH] Allow escaping of non-printable characters in CSV output/input This is a proposed solution for #124. It introduces a new Feature, `ESCAPE_CONTROL_CHARS_WITH_ESCAPE_CHAR`, which will apply the standard ASCII escapes from JSON to all characters that the CSV generator writes. If this solution is workable, I will add tests. --- .../dataformat/csv/CsvCharacterEscapes.java | 101 ++++++++++ .../jackson/dataformat/csv/CsvFactory.java | 2 + .../jackson/dataformat/csv/CsvGenerator.java | 35 +++- .../dataformat/csv/impl/CsvEncoder.java | 178 ++++++++++++++++-- 4 files changed, 304 insertions(+), 12 deletions(-) create mode 100644 csv/src/main/java/com/fasterxml/jackson/dataformat/csv/CsvCharacterEscapes.java diff --git a/csv/src/main/java/com/fasterxml/jackson/dataformat/csv/CsvCharacterEscapes.java b/csv/src/main/java/com/fasterxml/jackson/dataformat/csv/CsvCharacterEscapes.java new file mode 100644 index 00000000..cc285378 --- /dev/null +++ b/csv/src/main/java/com/fasterxml/jackson/dataformat/csv/CsvCharacterEscapes.java @@ -0,0 +1,101 @@ +package com.fasterxml.jackson.dataformat.csv; + +import com.fasterxml.jackson.core.SerializableString; +import com.fasterxml.jackson.core.io.CharTypes; +import com.fasterxml.jackson.core.io.CharacterEscapes; +import com.fasterxml.jackson.dataformat.csv.CsvGenerator.Feature; + +/** + * Character escapes for CSV. There are multiple types of escapes. + * + * + */ +public final class CsvCharacterEscapes extends CharacterEscapes +{ + + private static final long serialVersionUID = 1L; + + // No character escapes, every character returned as is. + private static final CsvCharacterEscapes sNoEscapesInstance = new CsvCharacterEscapes(new int[0]); + + // Only escape quotes, controlled by {@link Feature#ESCAPE_QUOTE_CHAR_WITH_ESCAPE_CHAR}. 
+ private static final CsvCharacterEscapes sQuoteEscapesInstance; + + // Only escape control chars, do *not* escape the quote char. See {@link Feature#ESCAPE_CONTROL_CHARS_WITH_ESCAPE_CHAR}. + private static final CsvCharacterEscapes sControlEscapesInstance; + + // Escape control chars and the quote char. + private static final CsvCharacterEscapes sControlQuoteEscapesInstance = new CsvCharacterEscapes(CharacterEscapes.standardAsciiEscapesForJSON()); + + private static final CsvCharacterEscapes [] sEscapes; + + static { + int[] quoteEscapes = new int[(int) '"' + 1]; + quoteEscapes[(int) '"'] = '"'; + sQuoteEscapesInstance = new CsvCharacterEscapes(quoteEscapes); + + int[] controlEscapes = CharacterEscapes.standardAsciiEscapesForJSON(); + controlEscapes['"'] = 0; // do not escape ", double it up. + sControlEscapesInstance = new CsvCharacterEscapes(controlEscapes); + + sEscapes = new CsvCharacterEscapes[4]; + sEscapes[0] = sNoEscapesInstance; + sEscapes[1] = sQuoteEscapesInstance; + sEscapes[2] = sControlEscapesInstance; + sEscapes[3] = sControlQuoteEscapesInstance; + } + + + private final int[] escapes; + + private CsvCharacterEscapes(int[] escapes) + { + this.escapes = escapes; + } + + public static CsvCharacterEscapes noEscapesInstance() + { + return sNoEscapesInstance; + } + + public static CsvCharacterEscapes quoteEscapesInstance() + { + return sQuoteEscapesInstance; + } + + public static CsvCharacterEscapes controlEscapesInstance() + { + return sControlEscapesInstance; + } + + public static CsvCharacterEscapes controlQuoteEscapesInstance() + { + return sControlQuoteEscapesInstance; + } + + public static CsvCharacterEscapes fromCsvFeatures(int csvFeatures) + { + int idx = 0; + idx |= CsvGenerator.Feature.ESCAPE_QUOTE_CHAR_WITH_ESCAPE_CHAR.enabledIn(csvFeatures) ? 1 : 0; + idx |= Feature.ESCAPE_CONTROL_CHARS_WITH_ESCAPE_CHAR.enabledIn(csvFeatures) ? 
2 : 0; + + return sEscapes[idx]; + } + + @Override + public SerializableString getEscapeSequence(int ch) + { + return null; // unused for CSV escapes + } + + @Override + public int[] getEscapeCodesForAscii() + { + return escapes; + } +} diff --git a/csv/src/main/java/com/fasterxml/jackson/dataformat/csv/CsvFactory.java b/csv/src/main/java/com/fasterxml/jackson/dataformat/csv/CsvFactory.java index ab145206..e338d64d 100644 --- a/csv/src/main/java/com/fasterxml/jackson/dataformat/csv/CsvFactory.java +++ b/csv/src/main/java/com/fasterxml/jackson/dataformat/csv/CsvFactory.java @@ -419,6 +419,8 @@ protected CsvGenerator _createGenerator(IOContext ctxt, Writer out) throws IOExc CsvGenerator gen = new CsvGenerator(ctxt, _generatorFeatures, _csvGeneratorFeatures, _objectCodec, out, _schema); // any other initializations? No? + + gen.setCharacterEscapes(CsvCharacterEscapes.fromCsvFeatures(_csvGeneratorFeatures)); return gen; } diff --git a/csv/src/main/java/com/fasterxml/jackson/dataformat/csv/CsvGenerator.java b/csv/src/main/java/com/fasterxml/jackson/dataformat/csv/CsvGenerator.java index 710f9bc3..2f310efd 100644 --- a/csv/src/main/java/com/fasterxml/jackson/dataformat/csv/CsvGenerator.java +++ b/csv/src/main/java/com/fasterxml/jackson/dataformat/csv/CsvGenerator.java @@ -7,6 +7,7 @@ import com.fasterxml.jackson.core.*; import com.fasterxml.jackson.core.base.GeneratorBase; +import com.fasterxml.jackson.core.io.CharacterEscapes; import com.fasterxml.jackson.core.json.JsonWriteContext; import com.fasterxml.jackson.core.io.IOContext; import com.fasterxml.jackson.dataformat.csv.impl.CsvEncoder; @@ -81,7 +82,19 @@ public enum Feature * * @since 2.9.3 */ - ESCAPE_QUOTE_CHAR_WITH_ESCAPE_CHAR(false) + ESCAPE_QUOTE_CHAR_WITH_ESCAPE_CHAR(false), + + /** + * Feature that determines whether control characters (non-printable) are escaped using the + * configured escape character. This feature allows LF and CR characters to be output as
{@code \n} + * and {@code \r} instead of being echoed out. This is a compatibility feature for some + * parsers that can not read such output back in. + * <p>
+ * Default value is false so that control characters are echoed out (backwards compatible). + * + * @since 2.9.9 + */ + ESCAPE_CONTROL_CHARS_WITH_ESCAPE_CHAR(false) ; protected final boolean _defaultState; @@ -146,6 +159,8 @@ private Feature(boolean defaultState) { // note: can not be final since we may need to re-create it for new schema protected CsvEncoder _writer; + protected CharacterEscapes _characterEscapes = null; + /* /********************************************************** /* Output state @@ -220,6 +235,8 @@ public CsvGenerator(IOContext ctxt, int jsonFeatures, int csvFeatures, _formatFeatures = csvFeatures; _schema = schema; _writer = new CsvEncoder(ctxt, csvFeatures, out, schema); + + _writer.setOutputEscapes(CsvCharacterEscapes.fromCsvFeatures(csvFeatures).getEscapeCodesForAscii()); } public CsvGenerator(IOContext ctxt, int jsonFeatures, int csvFeatures, @@ -312,6 +329,22 @@ public JsonGenerator overrideFormatFeatures(int values, int mask) return this; } + public JsonGenerator setCharacterEscapes(CharacterEscapes esc) { + this._characterEscapes = esc; + if (esc != null) { + this._writer.setOutputEscapes(esc.getEscapeCodesForAscii()); + } else { + this._writer.setOutputEscapes(CsvCharacterEscapes.fromCsvFeatures(_formatFeatures).getEscapeCodesForAscii()); + } + + return this; + } + + public CharacterEscapes getCharacterEscapes() { + return this._characterEscapes; + } + + /* /********************************************************** /* Public API, capability introspection methods diff --git a/csv/src/main/java/com/fasterxml/jackson/dataformat/csv/impl/CsvEncoder.java b/csv/src/main/java/com/fasterxml/jackson/dataformat/csv/impl/CsvEncoder.java index da6b9907..3ac45ca9 100644 --- a/csv/src/main/java/com/fasterxml/jackson/dataformat/csv/impl/CsvEncoder.java +++ b/csv/src/main/java/com/fasterxml/jackson/dataformat/csv/impl/CsvEncoder.java @@ -4,8 +4,10 @@ import java.io.Writer; import java.util.Arrays; +import com.fasterxml.jackson.core.io.CharTypes; 
import com.fasterxml.jackson.core.io.IOContext; import com.fasterxml.jackson.dataformat.csv.CsvGenerator; +import com.fasterxml.jackson.dataformat.csv.CsvGenerator.Feature; import com.fasterxml.jackson.dataformat.csv.CsvSchema; /** @@ -15,6 +17,14 @@ */ public class CsvEncoder { + + /* + * default set of escaped characters. + */ + private static final int [] sOutputEscapes = new int[0]; + + final protected static char[] HEX_CHARS = CharTypes.copyHexChars(); + /* As an optimization we try coalescing short writes into * buffer; but pass longer directly. */ @@ -29,7 +39,13 @@ public class CsvEncoder private final static char[] TRUE_CHARS = "true".toCharArray(); private final static char[] FALSE_CHARS = "false".toCharArray(); - + + /** + * Currently active set of output escape code definitions (whether + * and how to escape or not). + */ + protected int[] _outputEscapes = sOutputEscapes; + /* /********************************************************** /* Configuration @@ -93,8 +109,18 @@ public class CsvEncoder protected boolean _cfgEscapeQuoteCharWithEscapeChar; + /** + * @since 2.9.9 + */ + protected boolean _cfgEscapeControlCharWithEscapeChar; + protected final char _cfgQuoteCharEscapeChar; + /** + * @since 2.9.9 + */ + protected final char _cfgControlCharEscapeChar; + /* /********************************************************** /* Output state @@ -174,6 +200,7 @@ public CsvEncoder(IOContext ctxt, int csvFeatures, Writer out, CsvSchema schema) _cfgAlwaysQuoteStrings = CsvGenerator.Feature.ALWAYS_QUOTE_STRINGS.enabledIn(csvFeatures); _cfgAlwaysQuoteEmptyStrings = CsvGenerator.Feature.ALWAYS_QUOTE_EMPTY_STRINGS.enabledIn(csvFeatures); _cfgEscapeQuoteCharWithEscapeChar = CsvGenerator.Feature.ESCAPE_QUOTE_CHAR_WITH_ESCAPE_CHAR.enabledIn(csvFeatures); + _cfgEscapeControlCharWithEscapeChar = Feature.ESCAPE_CONTROL_CHARS_WITH_ESCAPE_CHAR.enabledIn(csvFeatures); _outputBuffer = ctxt.allocConcatBuffer(); _bufferRecyclable = true; @@ -198,6 +225,8 @@ public 
CsvEncoder(IOContext ctxt, int csvFeatures, Writer out, CsvSchema schema) _cfgQuoteCharacter, _cfgEscapeCharacter ); + + _cfgControlCharEscapeChar = _cfgEscapeCharacter > 0 ? (char) _cfgEscapeCharacter : '\\'; } public CsvEncoder(CsvEncoder base, CsvSchema newSchema) @@ -209,12 +238,14 @@ public CsvEncoder(CsvEncoder base, CsvSchema newSchema) _cfgAlwaysQuoteStrings = base._cfgAlwaysQuoteStrings; _cfgAlwaysQuoteEmptyStrings = base._cfgAlwaysQuoteEmptyStrings; _cfgEscapeQuoteCharWithEscapeChar = base._cfgEscapeQuoteCharWithEscapeChar; + _cfgEscapeControlCharWithEscapeChar = base._cfgEscapeControlCharWithEscapeChar; _outputBuffer = base._outputBuffer; _bufferRecyclable = base._bufferRecyclable; _outputEnd = base._outputEnd; _out = base._out; _cfgMaxQuoteCheckChars = base._cfgMaxQuoteCheckChars; + _outputEscapes = base._outputEscapes; _cfgColumnSeparator = newSchema.getColumnSeparator(); _cfgQuoteCharacter = newSchema.getQuoteChar(); @@ -229,6 +260,7 @@ public CsvEncoder(CsvEncoder base, CsvSchema newSchema) newSchema.getQuoteChar(), newSchema.getEscapeChar() ); + _cfgControlCharEscapeChar = _cfgEscapeCharacter > 0 ? (char) _cfgEscapeCharacter : '\\'; } private final char _getQuoteCharEscapeChar( @@ -275,10 +307,16 @@ public CsvEncoder overrideFormatFeatures(int feat) { _cfgAlwaysQuoteStrings = CsvGenerator.Feature.ALWAYS_QUOTE_STRINGS.enabledIn(feat); _cfgAlwaysQuoteEmptyStrings = CsvGenerator.Feature.ALWAYS_QUOTE_EMPTY_STRINGS.enabledIn(feat); _cfgEscapeQuoteCharWithEscapeChar = CsvGenerator.Feature.ESCAPE_QUOTE_CHAR_WITH_ESCAPE_CHAR.enabledIn(feat); + _cfgEscapeControlCharWithEscapeChar = Feature.ESCAPE_CONTROL_CHARS_WITH_ESCAPE_CHAR.enabledIn(feat); } return this; } + public CsvEncoder setOutputEscapes(int [] esc) { + _outputEscapes = (esc != null) ? 
esc : sOutputEscapes; + return this; + } + /* /********************************************************** /* Read-access to output state @@ -696,6 +734,9 @@ private void writeRawLong(String text) throws IOException public void _writeQuoted(String text) throws IOException { + final int[] escCodes = _outputEscapes; + final int escLen = escCodes.length; + if (_outputTail >= _outputEnd) { _flushBuffer(); } @@ -718,8 +759,15 @@ public void _writeQuoted(String text) throws IOException text.getChars(0, len, buf, ptr); final int end = ptr+len; - - for (; ptr < end && buf[ptr] != q; ++ptr) { } + + for (; ptr < end; ++ptr) { + char c = buf[ptr]; + // see if any of the characters need escaping. + // if yes, fall back to the more convoluted write method + if ((c == q) || (c < escLen && escCodes[c] != 0)) { + break; // for + } + } if (ptr == end) { // all good, no quoting or escaping! _outputBuffer[ptr] = q; @@ -731,16 +779,28 @@ public void _writeQuoted(String text) throws IOException protected void _writeQuoted(String text, char q, int i) throws IOException { + final int[] escCodes = _outputEscapes; + final int escLen = escCodes.length; + final char[] buf = _outputBuffer; _outputTail += i; final int len = text.length(); for (; i < len; ++i) { char c = text.charAt(i); + if (c < escLen) { + int escCode = escCodes[c]; + if (escCode != 0) { // for escape control and double quotes, c will be 0 + _appendCharacterEscape(c, escCode); + continue; // for + } + } + if (c == q) { // double up if (_outputTail >= _outputEnd) { _flushBuffer(); } - buf[_outputTail++] = _cfgQuoteCharEscapeChar; + + buf[_outputTail++] = _cfgQuoteCharEscapeChar; // this will be the quote } if (_outputTail >= _outputEnd) { _flushBuffer(); @@ -755,12 +815,23 @@ protected void _writeQuoted(String text, char q, int i) throws IOException private final void _writeLongQuoted(String text, char q) throws IOException { + final int[] escCodes = _outputEscapes; + final int escLen = escCodes.length; + final int len = 
text.length(); for (int i = 0; i < len; ++i) { if (_outputTail >= _outputEnd) { _flushBuffer(); } char c = text.charAt(i); + if (c < escLen) { + int escCode = escCodes[c]; + if (escCode != 0) { // for escape control and double quotes, c will be 0 + _appendCharacterEscape(c, escCode); + continue; // for + } + } + if (c == q) { // double up _outputBuffer[_outputTail++] = _cfgQuoteCharEscapeChar; if (_outputTail >= _outputEnd) { @@ -777,6 +848,9 @@ private final void _writeLongQuoted(String text, char q) throws IOException public void _writeQuotedAndEscaped(String text, char esc) throws IOException { + final int[] escCodes = _outputEscapes; + final int escLen = escCodes.length; + if (_outputTail >= _outputEnd) { _flushBuffer(); } @@ -796,7 +870,7 @@ public void _writeQuotedAndEscaped(String text, char esc) throws IOException final int end = ptr+len; for (; ptr < end; ++ptr) { char c = buf[ptr]; - if ((c == q) || (c == esc)) { + if ((c == q) || (c == esc) || (c < escLen && escCodes[c] != 0)) { break; } } @@ -811,17 +885,36 @@ public void _writeQuotedAndEscaped(String text, char esc) throws IOException protected void _writeQuotedAndEscaped(String text, char q, char esc, int i) throws IOException { + final int[] escCodes = _outputEscapes; + final int escLen = escCodes.length; + final char[] buf = _outputBuffer; _outputTail += i; final int len = text.length(); for (; i < len; ++i) { char c = text.charAt(i); - if ((c == q) || (c == esc)) { // double up, either way + if (c < escLen) { + int escCode = escCodes[c]; + if (escCode != 0) { // for escape control and double quotes, c will be 0 + _appendCharacterEscape(c, escCode); + continue; // for + } + } + + if (c == q) { // double up + if (_outputTail >= _outputEnd) { + _flushBuffer(); + } + + _outputBuffer[_outputTail++] = _cfgQuoteCharEscapeChar; + } else if (c == esc) { // double up if (_outputTail >= _outputEnd) { _flushBuffer(); } - buf[_outputTail++] = (c == q) ? 
_cfgQuoteCharEscapeChar : c; + + _outputBuffer[_outputTail++] = _cfgControlCharEscapeChar; } + if (_outputTail >= _outputEnd) { _flushBuffer(); } @@ -835,6 +928,9 @@ protected void _writeQuotedAndEscaped(String text, char q, char esc, int i) thro private final void _writeLongQuotedAndEscaped(String text, char esc) throws IOException { + final int[] escCodes = _outputEscapes; + final int escLen = escCodes.length; + final int len = text.length(); // NOTE: caller should guarantee quote char is valid (not -1) at this point: final char q = (char) _cfgQuoteCharacter; @@ -844,12 +940,26 @@ private final void _writeLongQuotedAndEscaped(String text, char esc) throws IOEx _flushBuffer(); } char c = text.charAt(i); - if ((c == q) || (c == esc)) { // double up, either way - _outputBuffer[_outputTail++] = (c == q) ? quoteEscape : c; + if (c < escLen) { + int escCode = escCodes[c]; + if (escCode != 0) { // for escape control and double quotes, c will be 0 + _appendCharacterEscape(c, escCode); + continue; // for + } + } + + if (c == q) { // double up + _outputBuffer[_outputTail++] = _cfgQuoteCharEscapeChar; + if (_outputTail >= _outputEnd) { + _flushBuffer(); + } + } else if (c == esc) { // double up + _outputBuffer[_outputTail++] = _cfgControlCharEscapeChar; if (_outputTail >= _outputEnd) { _flushBuffer(); } } + _outputBuffer[_outputTail++] = c; } if (_outputTail >= _outputEnd) { @@ -954,11 +1064,15 @@ protected final boolean _needsQuotingLoose(String value, int esc) protected boolean _needsQuotingStrict(String value) { final int minSafe = _cfgMinSafeChar; + + final int[] escCodes = _outputEscapes; + final int escLen = escCodes.length; + for (int i = 0, len = value.length(); i < len; ++i) { int c = value.charAt(i); if (c < minSafe) { if (c == _cfgColumnSeparator || c == _cfgQuoteCharacter - || c == '\r' || c == '\n' + || (c < escLen && escCodes[c] != 0) // 31-Dec-2014, tatu: Comment lines start with # so quote if starts with # || (c == '#' && i == 0)) { return true; @@ -974,11 
+1088,15 @@ protected boolean _needsQuotingStrict(String value) protected boolean _needsQuotingStrict(String value, int esc) { final int minSafe = _cfgMinSafeChar; + + final int[] escCodes = _outputEscapes; + final int escLen = escCodes.length; + for (int i = 0, len = value.length(); i < len; ++i) { int c = value.charAt(i); if (c < minSafe) { if (c == _cfgColumnSeparator || c == _cfgQuoteCharacter - || c == '\r' || c == '\n' + || (c < escLen && escCodes[c] != 0) // 31-Dec-2014, tatu: Comment lines start with # so quote if starts with # || (c == '#' && i == 0)) { return true; @@ -1016,4 +1134,42 @@ public void _releaseBuffers() _ioContext.releaseConcatBuffer(buf); } } + + /** + * Method called to append escape sequence for given character, at the + * end of standard output buffer; or if not possible, write out directly. + */ + private void _appendCharacterEscape(char ch, int escCode) throws IOException + { + if (escCode >= 0) { // \\N (2 char) + if ((_outputTail + 2) > _outputEnd) { + _flushBuffer(); + } + _outputBuffer[_outputTail++] = _cfgControlCharEscapeChar; + _outputBuffer[_outputTail++] = (char) escCode; + return; + } + + if ((_outputTail + 5) >= _outputEnd) { + _flushBuffer(); + } + int ptr = _outputTail; + char[] buf = _outputBuffer; + buf[ptr++] = '\\'; + buf[ptr++] = 'u'; + // We know it's a control char, so only the last 2 chars are non-0 + if (ch > 0xFF) { // beyond 8 bits + int hi = (ch >> 8) & 0xFF; + buf[ptr++] = HEX_CHARS[hi >> 4]; + buf[ptr++] = HEX_CHARS[hi & 0xF]; + ch &= 0xFF; + } else { + buf[ptr++] = '0'; + buf[ptr++] = '0'; + } + buf[ptr++] = HEX_CHARS[ch >> 4]; + buf[ptr++] = HEX_CHARS[ch & 0xF]; + _outputTail = ptr; + return; + } }