diff --git a/release-notes/VERSION-2.x b/release-notes/VERSION-2.x index 901c5e723..9ab954a0a 100644 --- a/release-notes/VERSION-2.x +++ b/release-notes/VERSION-2.x @@ -14,6 +14,7 @@ Modules: #239: Should validate UTF-8 multi-byte validity for short decode path too #248: Deprecate `CloseSafeUTF8Writer`, remove use +#252: Make `SmileFactory` support `JsonFactory.Feature.CANONICALIZE_FIELD_NAMES` - `Ion-java` dep 1.4.0 -> 1.8.0 2.12.2 (03-Mar-2021) diff --git a/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileFactory.java b/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileFactory.java index e98d729cc..fdf419ae5 100644 --- a/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileFactory.java +++ b/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileFactory.java @@ -412,6 +412,8 @@ public SmileGenerator createGenerator(OutputStream out) throws IOException { @Override public NonBlockingByteArrayParser createNonBlockingByteArrayParser() throws IOException { IOContext ctxt = _createContext(null, false); + // 13-Mar-2021, tatu: [dataformats-binary#252] Leave async parser with + // always-canonicalizing, for now (2.13) -- to be improved in future ByteQuadsCanonicalizer can = _byteSymbolCanonicalizer.makeChild(_factoryFeatures); return new NonBlockingByteArrayParser(ctxt, _parserFeatures, _smileParserFeatures, can); } diff --git a/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileParser.java b/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileParser.java index 572396ab7..0590c339a 100644 --- a/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileParser.java +++ b/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileParser.java @@ -763,14 +763,8 @@ public String nextFieldName() throws IOException } case 2: // short ASCII { - int len = 1 + (ch & 0x3f); - String name = _findDecodedFromSymbols(len); - if (name != null) { - _inputPtr += len; - } else { - name = _decodeShortAsciiName(len); - name = _addDecodedToSymbols(len, name); - } + final int len = 1 + (ch & 0x3f); + final String name = _findOrDecodeShortAsciiName(len); if (_seenNames != null) { if (_seenNameCount >= _seenNames.length) { _seenNames = _expandSeenNames(_seenNames); @@ -796,16 +790,10 @@ public String nextFieldName() throws IOException } } else { final int len = ch + 2; // values from 2 to 57... - String name = _findDecodedFromSymbols(len); - if (name != null) { - _inputPtr += len; - } else { - name = _decodeShortUnicodeName(len); - name = _addDecodedToSymbols(len, name); - } + final String name = _findOrDecodeShortUnicodeName(len); if (_seenNames != null) { if (_seenNameCount >= _seenNames.length) { - _seenNames = _expandSeenNames(_seenNames); + _seenNames = _expandSeenNames(_seenNames); } _seenNames[_seenNameCount++] = name; } @@ -1329,14 +1317,8 @@ protected final JsonToken _handleFieldName() throws IOException return JsonToken.FIELD_NAME; case 2: // short ASCII { - int len = 1 + (ch & 0x3f); - String name = _findDecodedFromSymbols(len); - if (name != null) { - _inputPtr += len; - } else { - name = _decodeShortAsciiName(len); - name = _addDecodedToSymbols(len, name); - } + final int len = 1 + (ch & 0x3f); + final String name = _findOrDecodeShortAsciiName(len); if (_seenNames != null) { if (_seenNameCount >= _seenNames.length) { _seenNames = _expandSeenNames(_seenNames); @@ -1360,16 +1342,10 @@ protected final JsonToken _handleFieldName() throws IOException } } else { final int len = ch + 2; // values from 2 to 57... - String name = _findDecodedFromSymbols(len); - if (name != null) { - _inputPtr += len; - } else { - name = _decodeShortUnicodeName(len); - name = _addDecodedToSymbols(len, name); - } + final String name = _findOrDecodeShortUnicodeName(len); if (_seenNames != null) { if (_seenNameCount >= _seenNames.length) { - _seenNames = _expandSeenNames(_seenNames); + _seenNames = _expandSeenNames(_seenNames); } _seenNames[_seenNameCount++] = name; } @@ -1384,6 +1360,46 @@ protected final JsonToken _handleFieldName() throws IOException return null; } + private String _findOrDecodeShortAsciiName(final int len) throws IOException + { + // First things first: must ensure all in buffer + if ((_inputEnd - _inputPtr) < len) { + _loadToHaveAtLeast(len); + } + if (_symbolsCanonical) { + String name = _findDecodedFromSymbols(len); + if (name != null) { + _inputPtr += len; + } else { + name = _decodeShortAsciiName(len); + name = _addDecodedToSymbols(len, name); + } + return name; + } + // if not canonicalizing, much simpler: + return _decodeShortAsciiName(len); + } + + private String _findOrDecodeShortUnicodeName(final int len) throws IOException + { + // First things first: must ensure all in buffer + if ((_inputEnd - _inputPtr) < len) { + _loadToHaveAtLeast(len); + } + if (_symbolsCanonical) { + String name = _findDecodedFromSymbols(len); + if (name != null) { + _inputPtr += len; + } else { + name = _decodeShortUnicodeName(len); + name = _addDecodedToSymbols(len, name); + } + return name; + } + // if not canonicalizing, much simpler: + return _decodeShortUnicodeName(len); + } + /** * Method called to try to expand shared name area to fit one more potentially * shared String. If area is already at its biggest size, will just clear @@ -1474,7 +1490,6 @@ private final String _decodeShortUnicodeName(int len) int outPtr = 0; char[] outBuf = _textBuffer.emptyAndGetCurrentSegment(); int inPtr = _inputPtr; - _inputPtr += len; final int[] codes = SmileConstants.sUtf8UnitLengths; final byte[] inBuf = _inputBuffer; for (int end = inPtr + len; inPtr < end; ) { @@ -1502,16 +1517,21 @@ private final String _decodeShortUnicodeName(int len) i = 0xDC00 | (i & 0x3FF); break; default: // invalid - _reportError("Invalid byte "+Integer.toHexString(i)+" in short Unicode text block"); + // Update pointer here to point to (more) correct location + _inputPtr = inPtr; + _reportError("Invalid byte 0x"+Integer.toHexString(i)+" in short Unicode text block"); } } outBuf[outPtr++] = (char) i; } + // let's only update offset here, so error message accurate + _inputPtr += len; return _textBuffer.setCurrentAndReturn(outPtr); } // note: slightly edited copy of UTF8StreamParser.addName() - private final String _decodeLongUnicodeName(int[] quads, int byteLen, int quadLen) + private final String _decodeLongUnicodeName(int[] quads, int byteLen, int quadLen, + boolean addToSymbolTable) throws IOException { int lastQuadBytes = byteLen & 3; @@ -1611,7 +1631,10 @@ private final String _decodeLongUnicodeName(int[] quads, int byteLen, int quadLe if (lastQuadBytes > 0) { quads[quadLen-1] = lastQuad; } - return _symbols.addName(baseName, quads, quadLen); + if (addToSymbolTable) { + return _symbols.addName(baseName, quads, quadLen); + } + return baseName; } private final void _handleLongFieldName() throws IOException @@ -1674,9 +1697,11 @@ private final void _handleLongFieldName() throws IOException byteLen += bytes; } // Know this name already? - String name = _symbols.findName(_quadBuffer, quads); + String name = _symbolsCanonical ? + _symbols.findName(_quadBuffer, quads) : null; if (name == null) { - name = _decodeLongUnicodeName(_quadBuffer, byteLen, quads); + name = _decodeLongUnicodeName(_quadBuffer, byteLen, quads, + _symbolsCanonical); } if (_seenNames != null) { if (_seenNameCount >= _seenNames.length) { @@ -1689,13 +1714,12 @@ private final void _handleLongFieldName() throws IOException /** * Helper method for trying to find specified encoded UTF-8 byte sequence - * from symbol table; if successful avoids actual decoding to String + * from symbol table; if successful avoids actual decoding to String. + *

+ * NOTE: caller MUST ensure input buffer has enough content. */ private final String _findDecodedFromSymbols(final int len) throws IOException { - if ((_inputEnd - _inputPtr) < len) { - _loadToHaveAtLeast(len); - } // First: maybe we already have this name decoded? if (len < 5) { int inPtr = _inputPtr; @@ -1762,13 +1786,13 @@ private final String _findDecodedFromSymbols(final int len) throws IOException _quad3 = q3; return _symbols.findName(q1, q2, q3); } - return _findDecodedLong(len, q1, q2); + return _findDecodedFixed12(len, q1, q2); } /** - * Method for locating names longer than 8 bytes (in UTF-8) + * Method for locating names longer than 12 bytes (in UTF-8) */ - private final String _findDecodedLong(int len, int q1, int q2) throws IOException + private final String _findDecodedFixed12(int len, int q1, int q2) throws IOException { // first, need enough buffer to store bytes as ints: { diff --git a/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileParserBase.java b/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileParserBase.java index d1afe5086..35a84f5d4 100644 --- a/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileParserBase.java +++ b/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileParserBase.java @@ -57,7 +57,7 @@ public abstract class SmileParserBase extends ParserMinimalBase * I/O context for this reader. It handles buffer allocation * for the reader. */ - final protected IOContext _ioContext; + protected final IOContext _ioContext; /** * Flag that indicates whether parser is closed or not. Gets @@ -180,7 +180,7 @@ public abstract class SmileParserBase extends ParserMinimalBase /** * Symbol table that contains field names encountered so far */ - final protected ByteQuadsCanonicalizer _symbols; + protected final ByteQuadsCanonicalizer _symbols; /** * Temporary buffer used for name parsing. @@ -210,6 +210,18 @@ public abstract class SmileParserBase extends ParserMinimalBase protected int _seenStringValueCount = -1; + /** + * Marker flag to indicate that standard symbol handling is used + * (one with symbol table assisted canonicalization. May be disabled + * in which case alternate stream-line, non-canonicalizing handling + * is used: usually due to set of symbols + * (Object property names) is unbounded and will not benefit from + * canonicalization attempts. + * + * @since 2.13 + */ + protected final boolean _symbolsCanonical; + /* /********************************************************** /* Thread-local recycling @@ -221,14 +233,14 @@ public abstract class SmileParserBase extends ParserMinimalBase * to a buffer recycler used to provide a low-cost * buffer recycling for Smile-specific buffers. */ - final protected static ThreadLocal>> _smileRecyclerRef + protected final static ThreadLocal>> _smileRecyclerRef = new ThreadLocal>>(); /** * Helper object used for low-level recycling of Smile-generator * specific buffers. */ - final protected SmileBufferRecycler _smileBufferRecycler; + protected final SmileBufferRecycler _smileBufferRecycler; /* /********************************************************** @@ -243,6 +255,7 @@ public SmileParserBase(IOContext ctxt, int parserFeatures, int formatFeatures, _formatFeatures = formatFeatures; _ioContext = ctxt; _symbols = sym; + _symbolsCanonical = sym.isCanonicalizing(); DupDetector dups = Feature.STRICT_DUPLICATE_DETECTION.enabledIn(parserFeatures) ? DupDetector.rootDetector(this) : null; _streamReadContext = JsonReadContext.createRootContext(dups); diff --git a/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileParserBootstrapper.java b/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileParserBootstrapper.java index aef3e66ac..19615a710 100644 --- a/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileParserBootstrapper.java +++ b/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileParserBootstrapper.java @@ -16,9 +16,9 @@ public class SmileParserBootstrapper { /* - /********************************************************** + /********************************************************************** /* Configuration - /********************************************************** + /********************************************************************** */ protected final IOContext _context; @@ -26,9 +26,9 @@ public class SmileParserBootstrapper protected final InputStream _in; /* - /********************************************************** + /********************************************************************** /* Input buffering - /********************************************************** + /********************************************************************** */ protected final byte[] _inputBuffer; @@ -44,9 +44,9 @@ public class SmileParserBootstrapper protected final boolean _bufferRecyclable; /* - /********************************************************** + /********************************************************************** /* Input location - /********************************************************** + /********************************************************************** */ /** @@ -59,9 +59,9 @@ public class SmileParserBootstrapper protected int _inputProcessed; /* - /********************************************************** + /********************************************************************** /* Life-cycle - /********************************************************** + /********************************************************************** */ public SmileParserBootstrapper(IOContext ctxt, InputStream in) @@ -91,7 +91,9 @@ public SmileParser constructParser(int factoryFeatures, ObjectCodec codec, ByteQuadsCanonicalizer rootByteSymbols) throws IOException, JsonParseException { - ByteQuadsCanonicalizer can = rootByteSymbols.makeChild(factoryFeatures); + // 13-Mar-2021, tatu: [dataformats-binary#252] Create canonicalizing OR + // placeholder, depending on settings + ByteQuadsCanonicalizer can = rootByteSymbols.makeChildOrPlaceholder(factoryFeatures); // We just need a single byte, really, to know if it starts with header int end = _inputEnd; if ((_inputPtr < end) && (_in != null)) { @@ -136,9 +138,9 @@ public SmileParser constructParser(int factoryFeatures, } /* - /********************************************************** + /********************************************************************** /* Encoding detection for data format auto-detection - /********************************************************** + /********************************************************************** */ public static MatchStrength hasSmileFormat(InputAccessor acc) throws IOException diff --git a/smile/src/test/java/com/fasterxml/jackson/dataformat/smile/parse/SymbolTableTest.java b/smile/src/test/java/com/fasterxml/jackson/dataformat/smile/parse/SymbolTableTest.java index dbf1d0009..319e4a8ee 100644 --- a/smile/src/test/java/com/fasterxml/jackson/dataformat/smile/parse/SymbolTableTest.java +++ b/smile/src/test/java/com/fasterxml/jackson/dataformat/smile/parse/SymbolTableTest.java @@ -1,6 +1,7 @@ package com.fasterxml.jackson.dataformat.smile.parse; import java.lang.reflect.Field; +import java.util.Random; import com.fasterxml.jackson.core.JsonFactory; import com.fasterxml.jackson.core.JsonParser; @@ -22,6 +23,8 @@ public void testSimpleDefault() throws Exception // First: should have empty symbol table try (JsonParser p = vanillaMapper.createParser(doc)) { ByteQuadsCanonicalizer syms = _findSymbols(p); + assertTrue(syms.isCanonicalizing()); // added in 2.13 + assertEquals(0, syms.size()); assertEquals(0, _findParent(syms).size()); @@ -67,11 +70,25 @@ public void testSimpleDefault() throws Exception } } - // !!! TODO: // [dataformats-binary#252]: should be able to prevent canonicalization + // Assumption: there is still non-null symbol table, but has "no canonicalization" public void testSimpleNoCanonicalize() throws Exception { - final byte[] doc = _smileDoc("{\"a\":1,\"b\":2}"); + final String[] fieldNames = new String[] { + // Ascii, various lengths + "abc", "abcd123", "abcdefghi123940963", "", + // Unicode, also (2-byte ones ought to be ok) + "F\u00F6\u00F6", "F\u00F6\u00F6bar", "Longer F\u00F6\u00F6bar", + + // and then couple of longer names; total needs to exceed 64k + generateName(77), + generateName(2000), + generateName(17000), + generateName(23000), + generateName(33033), + "end", // just simple end marker + }; + final byte[] doc = _smileDoc(jsonFrom(fieldNames)); final SmileMapper mapper = SmileMapper.builder(SmileFactory.builder() .disable(JsonFactory.Feature.CANONICALIZE_FIELD_NAMES) .build()) @@ -79,8 +96,63 @@ public void testSimpleNoCanonicalize() throws Exception try (JsonParser p = mapper.createParser(doc)) { ByteQuadsCanonicalizer syms = _findSymbols(p); - assertEquals(0, syms.size()); + assertFalse(syms.isCanonicalizing()); // added in 2.13 + assertEquals(-1, syms.size()); + // also, should not have parent: + assertNull(_findParent(syms)); + + assertToken(JsonToken.START_OBJECT, p.nextToken()); + assertToken(JsonToken.FIELD_NAME, p.nextToken()); + assertEquals(fieldNames[0], p.currentName()); + // should NOT add + assertEquals(-1, syms.size()); + assertToken(JsonToken.VALUE_NUMBER_INT, p.nextToken()); + assertEquals(0, p.getIntValue()); + + // and from thereon... + for (int i = 1; i < fieldNames.length; ++i) { + assertToken(JsonToken.FIELD_NAME, p.nextToken()); + assertEquals(fieldNames[i], p.currentName()); + assertToken(JsonToken.VALUE_NUMBER_INT, p.nextToken()); + assertEquals(i, p.getIntValue()); + } + assertToken(JsonToken.END_OBJECT, p.nextToken()); + assertNull(p.nextToken()); + + assertEquals(-1, syms.size()); + } + } + + private String jsonFrom(String... fields) { + StringBuilder sb = new StringBuilder(); + sb.append("{"); + + for (int i = 0, len = fields.length; i < len; ++i) { + if (i > 0) { + sb.append(",\n"); + } + sb.append('"').append(fields[i]).append('"') + .append(" : ").append(i); + } + sb.append("}"); + return sb.toString(); + } + + private String generateName(int minLen) + { + StringBuilder sb = new StringBuilder(); + Random rnd = new Random(123); + while (sb.length() < minLen) { + int ch = rnd.nextInt(96); + if (ch < 32) { // ascii (single byte) + sb.append((char) (48 + ch)); + } else if (ch < 64) { // 2 byte + sb.append((char) (128 + ch)); + } else { // 3 byte + sb.append((char) (4000 + ch)); + } } + return sb.toString(); } // Helper method to dig up symbol table reference for tests, without