diff --git a/release-notes/VERSION-2.x b/release-notes/VERSION-2.x index 901c5e723..9ab954a0a 100644 --- a/release-notes/VERSION-2.x +++ b/release-notes/VERSION-2.x @@ -14,6 +14,7 @@ Modules: #239: Should validate UTF-8 multi-byte validity for short decode path too #248: Deprecate `CloseSafeUTF8Writer`, remove use +#252: Make `SmileFactory` support `JsonFactory.Feature.CANONICALIZE_FIELD_NAMES` - `Ion-java` dep 1.4.0 -> 1.8.0 2.12.2 (03-Mar-2021) diff --git a/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileFactory.java b/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileFactory.java index e98d729cc..fdf419ae5 100644 --- a/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileFactory.java +++ b/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileFactory.java @@ -412,6 +412,8 @@ public SmileGenerator createGenerator(OutputStream out) throws IOException { @Override public NonBlockingByteArrayParser createNonBlockingByteArrayParser() throws IOException { IOContext ctxt = _createContext(null, false); + // 13-Mar-2021, tatu: [dataformats-binary#252] Leave async parser with + // always-canonicalizing, for now (2.13) -- to be improved in future ByteQuadsCanonicalizer can = _byteSymbolCanonicalizer.makeChild(_factoryFeatures); return new NonBlockingByteArrayParser(ctxt, _parserFeatures, _smileParserFeatures, can); } diff --git a/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileParser.java b/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileParser.java index 572396ab7..0590c339a 100644 --- a/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileParser.java +++ b/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileParser.java @@ -763,14 +763,8 @@ public String nextFieldName() throws IOException } case 2: // short ASCII { - int len = 1 + (ch & 0x3f); - String name = _findDecodedFromSymbols(len); - if (name != null) { - _inputPtr += len; - } else { - name = 
_decodeShortAsciiName(len); - name = _addDecodedToSymbols(len, name); - } + final int len = 1 + (ch & 0x3f); + final String name = _findOrDecodeShortAsciiName(len); if (_seenNames != null) { if (_seenNameCount >= _seenNames.length) { _seenNames = _expandSeenNames(_seenNames); @@ -796,16 +790,10 @@ public String nextFieldName() throws IOException } } else { final int len = ch + 2; // values from 2 to 57... - String name = _findDecodedFromSymbols(len); - if (name != null) { - _inputPtr += len; - } else { - name = _decodeShortUnicodeName(len); - name = _addDecodedToSymbols(len, name); - } + final String name = _findOrDecodeShortUnicodeName(len); if (_seenNames != null) { if (_seenNameCount >= _seenNames.length) { - _seenNames = _expandSeenNames(_seenNames); + _seenNames = _expandSeenNames(_seenNames); } _seenNames[_seenNameCount++] = name; } @@ -1329,14 +1317,8 @@ protected final JsonToken _handleFieldName() throws IOException return JsonToken.FIELD_NAME; case 2: // short ASCII { - int len = 1 + (ch & 0x3f); - String name = _findDecodedFromSymbols(len); - if (name != null) { - _inputPtr += len; - } else { - name = _decodeShortAsciiName(len); - name = _addDecodedToSymbols(len, name); - } + final int len = 1 + (ch & 0x3f); + final String name = _findOrDecodeShortAsciiName(len); if (_seenNames != null) { if (_seenNameCount >= _seenNames.length) { _seenNames = _expandSeenNames(_seenNames); @@ -1360,16 +1342,10 @@ protected final JsonToken _handleFieldName() throws IOException } } else { final int len = ch + 2; // values from 2 to 57... 
- String name = _findDecodedFromSymbols(len); - if (name != null) { - _inputPtr += len; - } else { - name = _decodeShortUnicodeName(len); - name = _addDecodedToSymbols(len, name); - } + final String name = _findOrDecodeShortUnicodeName(len); if (_seenNames != null) { if (_seenNameCount >= _seenNames.length) { - _seenNames = _expandSeenNames(_seenNames); + _seenNames = _expandSeenNames(_seenNames); } _seenNames[_seenNameCount++] = name; } @@ -1384,6 +1360,46 @@ protected final JsonToken _handleFieldName() throws IOException return null; } + private String _findOrDecodeShortAsciiName(final int len) throws IOException + { + // First things first: must ensure all in buffer + if ((_inputEnd - _inputPtr) < len) { + _loadToHaveAtLeast(len); + } + if (_symbolsCanonical) { + String name = _findDecodedFromSymbols(len); + if (name != null) { + _inputPtr += len; + } else { + name = _decodeShortAsciiName(len); + name = _addDecodedToSymbols(len, name); + } + return name; + } + // if not canonicalizing, much simpler: + return _decodeShortAsciiName(len); + } + + private String _findOrDecodeShortUnicodeName(final int len) throws IOException + { + // First things first: must ensure all in buffer + if ((_inputEnd - _inputPtr) < len) { + _loadToHaveAtLeast(len); + } + if (_symbolsCanonical) { + String name = _findDecodedFromSymbols(len); + if (name != null) { + _inputPtr += len; + } else { + name = _decodeShortUnicodeName(len); + name = _addDecodedToSymbols(len, name); + } + return name; + } + // if not canonicalizing, much simpler: + return _decodeShortUnicodeName(len); + } + /** * Method called to try to expand shared name area to fit one more potentially * shared String. 
If area is already at its biggest size, will just clear @@ -1474,7 +1490,6 @@ private final String _decodeShortUnicodeName(int len) int outPtr = 0; char[] outBuf = _textBuffer.emptyAndGetCurrentSegment(); int inPtr = _inputPtr; - _inputPtr += len; final int[] codes = SmileConstants.sUtf8UnitLengths; final byte[] inBuf = _inputBuffer; for (int end = inPtr + len; inPtr < end; ) { @@ -1502,16 +1517,21 @@ private final String _decodeShortUnicodeName(int len) i = 0xDC00 | (i & 0x3FF); break; default: // invalid - _reportError("Invalid byte "+Integer.toHexString(i)+" in short Unicode text block"); + // Update pointer here to point to (more) correct location + _inputPtr = inPtr; + _reportError("Invalid byte 0x"+Integer.toHexString(i)+" in short Unicode text block"); } } outBuf[outPtr++] = (char) i; } + // let's only update offset here, so error message accurate + _inputPtr += len; return _textBuffer.setCurrentAndReturn(outPtr); } // note: slightly edited copy of UTF8StreamParser.addName() - private final String _decodeLongUnicodeName(int[] quads, int byteLen, int quadLen) + private final String _decodeLongUnicodeName(int[] quads, int byteLen, int quadLen, + boolean addToSymbolTable) throws IOException { int lastQuadBytes = byteLen & 3; @@ -1611,7 +1631,10 @@ private final String _decodeLongUnicodeName(int[] quads, int byteLen, int quadLe if (lastQuadBytes > 0) { quads[quadLen-1] = lastQuad; } - return _symbols.addName(baseName, quads, quadLen); + if (addToSymbolTable) { + return _symbols.addName(baseName, quads, quadLen); + } + return baseName; } private final void _handleLongFieldName() throws IOException @@ -1674,9 +1697,11 @@ private final void _handleLongFieldName() throws IOException byteLen += bytes; } // Know this name already? - String name = _symbols.findName(_quadBuffer, quads); + String name = _symbolsCanonical ? 
+ _symbols.findName(_quadBuffer, quads) : null; if (name == null) { - name = _decodeLongUnicodeName(_quadBuffer, byteLen, quads); + name = _decodeLongUnicodeName(_quadBuffer, byteLen, quads, + _symbolsCanonical); } if (_seenNames != null) { if (_seenNameCount >= _seenNames.length) { @@ -1689,13 +1714,12 @@ private final void _handleLongFieldName() throws IOException /** * Helper method for trying to find specified encoded UTF-8 byte sequence - * from symbol table; if successful avoids actual decoding to String + * from symbol table; if successful avoids actual decoding to String. + *
+ * NOTE: caller MUST ensure input buffer has enough content.
*/
private final String _findDecodedFromSymbols(final int len) throws IOException
{
- if ((_inputEnd - _inputPtr) < len) {
- _loadToHaveAtLeast(len);
- }
// First: maybe we already have this name decoded?
if (len < 5) {
int inPtr = _inputPtr;
@@ -1762,13 +1786,13 @@ private final String _findDecodedFromSymbols(final int len) throws IOException
_quad3 = q3;
return _symbols.findName(q1, q2, q3);
}
- return _findDecodedLong(len, q1, q2);
+ return _findDecodedFixed12(len, q1, q2);
}
/**
- * Method for locating names longer than 8 bytes (in UTF-8)
+ * Method for locating names longer than 12 bytes (in UTF-8)
*/
- private final String _findDecodedLong(int len, int q1, int q2) throws IOException
+ private final String _findDecodedFixed12(int len, int q1, int q2) throws IOException
{
// first, need enough buffer to store bytes as ints:
{
diff --git a/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileParserBase.java b/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileParserBase.java
index d1afe5086..35a84f5d4 100644
--- a/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileParserBase.java
+++ b/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileParserBase.java
@@ -57,7 +57,7 @@ public abstract class SmileParserBase extends ParserMinimalBase
* I/O context for this reader. It handles buffer allocation
* for the reader.
*/
- final protected IOContext _ioContext;
+ protected final IOContext _ioContext;
/**
* Flag that indicates whether parser is closed or not. Gets
@@ -180,7 +180,7 @@ public abstract class SmileParserBase extends ParserMinimalBase
/**
* Symbol table that contains field names encountered so far
*/
- final protected ByteQuadsCanonicalizer _symbols;
+ protected final ByteQuadsCanonicalizer _symbols;
/**
* Temporary buffer used for name parsing.
@@ -210,6 +210,18 @@ public abstract class SmileParserBase extends ParserMinimalBase
protected int _seenStringValueCount = -1;
+ /**
+ * Marker flag to indicate that standard symbol handling is used
+ * (one with symbol table assisted canonicalization). May be disabled,
+ * in which case alternate streamlined, non-canonicalizing handling
+ * is used: usually because the set of symbols
+ * (Object property names) is unbounded and will not benefit from
+ * canonicalization attempts.
+ *
+ * @since 2.13
+ */
+ protected final boolean _symbolsCanonical;
+
/*
/**********************************************************
/* Thread-local recycling
@@ -221,14 +233,14 @@ public abstract class SmileParserBase extends ParserMinimalBase
* to a buffer recycler used to provide a low-cost
* buffer recycling for Smile-specific buffers.
*/
- final protected static ThreadLocal