Fix #252

FasterXML · Mar 13, 2021 · c4ab92f · c4ab92f
1 parent 10f9dcb
commit c4ab92f
Show file tree

Hide file tree

Showing 6 changed files with 177 additions and 63 deletions.
diff --git a/release-notes/VERSION-2.x b/release-notes/VERSION-2.x
@@ -14,6 +14,7 @@ Modules:
 
 #239: Should validate UTF-8 multi-byte validity for short decode path too
 #248: Deprecate `CloseSafeUTF8Writer`, remove use
+#252: Make `SmileFactory` support `JsonFactory.Feature.CANONICALIZE_FIELD_NAMES`
 - `Ion-java` dep 1.4.0 -> 1.8.0
 
 2.12.2 (03-Mar-2021)

diff --git a/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileFactory.java b/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileFactory.java
@@ -412,6 +412,8 @@ public SmileGenerator createGenerator(OutputStream out) throws IOException {
     @Override
     public NonBlockingByteArrayParser createNonBlockingByteArrayParser() throws IOException {
         IOContext ctxt = _createContext(null, false);
+        // 13-Mar-2021, tatu: [dataformats-binary#252] Leave async parser with
+        //   always-canonicalizing, for now (2.13) -- to be improved in future
         ByteQuadsCanonicalizer can = _byteSymbolCanonicalizer.makeChild(_factoryFeatures);
         return new NonBlockingByteArrayParser(ctxt, _parserFeatures, _smileParserFeatures, can);
     }

diff --git a/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileParser.java b/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileParser.java
@@ -763,14 +763,8 @@ public String nextFieldName() throws IOException
                 }
             case 2: // short ASCII
                 {
-                    int len = 1 + (ch & 0x3f);
-                    String name = _findDecodedFromSymbols(len);
-                    if (name != null) {
-                        _inputPtr += len;
-                    } else {
-                        name = _decodeShortAsciiName(len);
-                        name = _addDecodedToSymbols(len, name);
-                    }
+                    final int len = 1 + (ch & 0x3f);
+                    final String name = _findOrDecodeShortAsciiName(len);
                     if (_seenNames != null) {
                         if (_seenNameCount >= _seenNames.length) {
                             _seenNames = _expandSeenNames(_seenNames);
@@ -796,16 +790,10 @@ public String nextFieldName() throws IOException
                         }
                     } else {
                         final int len = ch + 2; // values from 2 to 57...
-                        String name = _findDecodedFromSymbols(len);
-                        if (name != null) {
-                            _inputPtr += len;
-                        } else {
-                            name = _decodeShortUnicodeName(len);
-                            name = _addDecodedToSymbols(len, name);
-                        }
+                        final String name = _findOrDecodeShortUnicodeName(len);
                         if (_seenNames != null) {
                             if (_seenNameCount >= _seenNames.length) {
-                             _seenNames = _expandSeenNames(_seenNames);
+                                _seenNames = _expandSeenNames(_seenNames);
                             }
                             _seenNames[_seenNameCount++] = name;
                         }
@@ -1329,14 +1317,8 @@ protected final JsonToken _handleFieldName() throws IOException
             return JsonToken.FIELD_NAME;
         case 2: // short ASCII
             {
-                int len = 1 + (ch & 0x3f);
-                String name = _findDecodedFromSymbols(len);
-                if (name != null) {
-                    _inputPtr += len;
-                } else {
-                    name = _decodeShortAsciiName(len);
-                    name = _addDecodedToSymbols(len, name);
-                }
+                final int len = 1 + (ch & 0x3f);
+                final String name = _findOrDecodeShortAsciiName(len);
                 if (_seenNames != null) {
                     if (_seenNameCount >= _seenNames.length) {
                         _seenNames = _expandSeenNames(_seenNames);
@@ -1360,16 +1342,10 @@ protected final JsonToken _handleFieldName() throws IOException
                     }
                 } else {
                     final int len = ch + 2; // values from 2 to 57...
-                    String name = _findDecodedFromSymbols(len);
-                    if (name != null) {
-                        _inputPtr += len;
-                    } else {
-                        name = _decodeShortUnicodeName(len);
-                        name = _addDecodedToSymbols(len, name);
-                    }
+                    final String name = _findOrDecodeShortUnicodeName(len);
                     if (_seenNames != null) {
                         if (_seenNameCount >= _seenNames.length) {
-    	                    _seenNames = _expandSeenNames(_seenNames);
+                            _seenNames = _expandSeenNames(_seenNames);
                         }
                         _seenNames[_seenNameCount++] = name;
                     }
@@ -1384,6 +1360,46 @@ protected final JsonToken _handleFieldName() throws IOException
         return null;
     }
 
+    private String _findOrDecodeShortAsciiName(final int len) throws IOException
+    {
+        // First things first: must ensure all in buffer
+        if ((_inputEnd - _inputPtr) < len) {
+            _loadToHaveAtLeast(len);
+        }
+        if (_symbolsCanonical) {
+            String name = _findDecodedFromSymbols(len);
+            if (name != null) {
+                _inputPtr += len;
+            } else {
+                name = _decodeShortAsciiName(len);
+                name = _addDecodedToSymbols(len, name);
+            }
+            return name;
+        }
+        // if not canonicalizing, much simpler:
+        return _decodeShortAsciiName(len);
+    }
+
+    private String _findOrDecodeShortUnicodeName(final int len) throws IOException
+    {
+        // First things first: must ensure all in buffer
+        if ((_inputEnd - _inputPtr) < len) {
+            _loadToHaveAtLeast(len);
+        }
+        if (_symbolsCanonical) {
+            String name = _findDecodedFromSymbols(len);
+            if (name != null) {
+                _inputPtr += len;
+            } else {
+                name = _decodeShortUnicodeName(len);
+                name = _addDecodedToSymbols(len, name);
+            }
+            return name;
+        }
+        // if not canonicalizing, much simpler:
+        return _decodeShortUnicodeName(len);
+    }
+
     /**
      * Method called to try to expand shared name area to fit one more potentially
      * shared String. If area is already at its biggest size, will just clear
@@ -1474,7 +1490,6 @@ private final String _decodeShortUnicodeName(int len)
         int outPtr = 0;
         char[] outBuf = _textBuffer.emptyAndGetCurrentSegment();
         int inPtr = _inputPtr;
-        _inputPtr += len;
         final int[] codes = SmileConstants.sUtf8UnitLengths;
         final byte[] inBuf = _inputBuffer;
         for (int end = inPtr + len; inPtr < end; ) {
@@ -1502,16 +1517,21 @@ private final String _decodeShortUnicodeName(int len)
                     i = 0xDC00 | (i & 0x3FF);
                     break;
                 default: // invalid
-                    _reportError("Invalid byte "+Integer.toHexString(i)+" in short Unicode text block");
+                    // Update pointer here to point to (more) correct location
+                    _inputPtr = inPtr;
+                    _reportError("Invalid byte 0x"+Integer.toHexString(i)+" in short Unicode text block");
                 }
             }
             outBuf[outPtr++] = (char) i;
         }
+        // let's only update offset here, so error message accurate
+        _inputPtr += len;
         return _textBuffer.setCurrentAndReturn(outPtr);
     }
 
     // note: slightly edited copy of UTF8StreamParser.addName()
-    private final String _decodeLongUnicodeName(int[] quads, int byteLen, int quadLen)
+    private final String _decodeLongUnicodeName(int[] quads, int byteLen, int quadLen,
+            boolean addToSymbolTable)
         throws IOException
     {
         int lastQuadBytes = byteLen & 3;
@@ -1611,7 +1631,10 @@ private final String _decodeLongUnicodeName(int[] quads, int byteLen, int quadLe
         if (lastQuadBytes > 0) {
             quads[quadLen-1] = lastQuad;
         }
-        return _symbols.addName(baseName, quads, quadLen);
+        if (addToSymbolTable) {
+            return _symbols.addName(baseName, quads, quadLen);
+        }
+        return baseName;
     }
 
     private final void _handleLongFieldName() throws IOException
@@ -1674,9 +1697,11 @@ private final void _handleLongFieldName() throws IOException
             byteLen += bytes;
         }
         // Know this name already?
-        String name = _symbols.findName(_quadBuffer, quads);
+        String name = _symbolsCanonical ? 
+            _symbols.findName(_quadBuffer, quads) : null;
         if (name == null) {
-            name = _decodeLongUnicodeName(_quadBuffer, byteLen, quads);
+            name = _decodeLongUnicodeName(_quadBuffer, byteLen, quads,
+                    _symbolsCanonical);
         }
         if (_seenNames != null) {
            if (_seenNameCount >= _seenNames.length) {
@@ -1689,13 +1714,12 @@ private final void _handleLongFieldName() throws IOException
 
     /**
      * Helper method for trying to find specified encoded UTF-8 byte sequence
-     * from symbol table; if successful avoids actual decoding to String
+     * from symbol table; if successful avoids actual decoding to String.
+     *<p>
+     * NOTE: caller MUST ensure input buffer has enough content.
      */
     private final String _findDecodedFromSymbols(final int len) throws IOException
     {
-        if ((_inputEnd - _inputPtr) < len) {
-            _loadToHaveAtLeast(len);
-        }
         // First: maybe we already have this name decoded?
         if (len < 5) {
             int inPtr = _inputPtr;
@@ -1762,13 +1786,13 @@ private final String _findDecodedFromSymbols(final int len) throws IOException
             _quad3 = q3;
             return _symbols.findName(q1, q2, q3);
         }
-        return _findDecodedLong(len, q1, q2);
+        return _findDecodedFixed12(len, q1, q2);
     }
 
     /**
-     * Method for locating names longer than 8 bytes (in UTF-8)
+     * Method for locating names longer than 12 bytes (in UTF-8)
      */
-    private final String _findDecodedLong(int len, int q1, int q2) throws IOException
+    private final String _findDecodedFixed12(int len, int q1, int q2) throws IOException
     {
         // first, need enough buffer to store bytes as ints:
         {

diff --git a/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileParserBase.java b/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileParserBase.java
@@ -57,7 +57,7 @@ public abstract class SmileParserBase extends ParserMinimalBase
      * I/O context for this reader. It handles buffer allocation
      * for the reader.
      */
-    final protected IOContext _ioContext;
+    protected final IOContext _ioContext;
 
     /**
      * Flag that indicates whether parser is closed or not. Gets
@@ -180,7 +180,7 @@ public abstract class SmileParserBase extends ParserMinimalBase
     /**
      * Symbol table that contains field names encountered so far
      */
-    final protected ByteQuadsCanonicalizer _symbols;
+    protected final ByteQuadsCanonicalizer _symbols;
 
     /**
      * Temporary buffer used for name parsing.
@@ -210,6 +210,18 @@ public abstract class SmileParserBase extends ParserMinimalBase
 
     protected int _seenStringValueCount = -1;
 
+    /**
+     * Marker flag to indicate that standard symbol handling is used
+     * (one with symbol table assisted canonicalization. May be disabled
+     * in which case alternate stream-line, non-canonicalizing handling
+     * is used: usually due to set of symbols
+     * (Object property names) is unbounded and will not benefit from
+     * canonicalization attempts.
+     *
+     * @since 2.13
+     */
+    protected final boolean _symbolsCanonical;
+
     /*
     /**********************************************************
     /* Thread-local recycling
@@ -221,14 +233,14 @@ public abstract class SmileParserBase extends ParserMinimalBase
      * to a buffer recycler used to provide a low-cost
      * buffer recycling for Smile-specific buffers.
      */
-    final protected static ThreadLocal<SoftReference<SmileBufferRecycler<String>>> _smileRecyclerRef
+    protected final static ThreadLocal<SoftReference<SmileBufferRecycler<String>>> _smileRecyclerRef
         = new ThreadLocal<SoftReference<SmileBufferRecycler<String>>>();
 
     /**
      * Helper object used for low-level recycling of Smile-generator
      * specific buffers.
      */
-    final protected SmileBufferRecycler<String> _smileBufferRecycler;
+    protected final SmileBufferRecycler<String> _smileBufferRecycler;
 
     /*
     /**********************************************************
@@ -243,6 +255,7 @@ public SmileParserBase(IOContext ctxt, int parserFeatures, int formatFeatures,
         _formatFeatures = formatFeatures;
         _ioContext = ctxt;
         _symbols = sym;
+        _symbolsCanonical = sym.isCanonicalizing();
         DupDetector dups = Feature.STRICT_DUPLICATE_DETECTION.enabledIn(parserFeatures)
                 ? DupDetector.rootDetector(this) : null;
         _streamReadContext = JsonReadContext.createRootContext(dups);

diff --git a/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileParserBootstrapper.java b/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileParserBootstrapper.java
@@ -16,19 +16,19 @@
 public class SmileParserBootstrapper
 {
     /*
-    /**********************************************************
+    /**********************************************************************
     /* Configuration
-    /**********************************************************
+    /**********************************************************************
      */
 
     protected final IOContext _context;
 
     protected final InputStream _in;
 
     /*
-    /**********************************************************
+    /**********************************************************************
     /* Input buffering
-    /**********************************************************
+    /**********************************************************************
      */
 
     protected final byte[] _inputBuffer;
@@ -44,9 +44,9 @@ public class SmileParserBootstrapper
     protected final boolean _bufferRecyclable;
 
     /*
-    /**********************************************************
+    /**********************************************************************
     /* Input location
-    /**********************************************************
+    /**********************************************************************
      */
 
     /**
@@ -59,9 +59,9 @@ public class SmileParserBootstrapper
     protected int _inputProcessed;
 
     /*
-    /**********************************************************
+    /**********************************************************************
     /* Life-cycle
-    /**********************************************************
+    /**********************************************************************
      */
 
     public SmileParserBootstrapper(IOContext ctxt, InputStream in)
@@ -91,7 +91,9 @@ public SmileParser constructParser(int factoryFeatures,
             ObjectCodec codec, ByteQuadsCanonicalizer rootByteSymbols)
         throws IOException, JsonParseException
     {
-        ByteQuadsCanonicalizer can = rootByteSymbols.makeChild(factoryFeatures);
+        // 13-Mar-2021, tatu: [dataformats-binary#252] Create canonicalizing OR
+        //    placeholder, depending on settings
+        ByteQuadsCanonicalizer can = rootByteSymbols.makeChildOrPlaceholder(factoryFeatures);
         // We just need a single byte, really, to know if it starts with header
         int end = _inputEnd;
         if ((_inputPtr < end) && (_in != null)) {
@@ -136,9 +138,9 @@ public SmileParser constructParser(int factoryFeatures,
     }
 
     /*
-    /**********************************************************
+    /**********************************************************************
     /*  Encoding detection for data format auto-detection
-    /**********************************************************
+    /**********************************************************************
      */
 
     public static MatchStrength hasSmileFormat(InputAccessor acc) throws IOException