Skip to content

Commit

Permalink
Fix #252
Browse files Browse the repository at this point in the history
  • Loading branch information
cowtowncoder committed Mar 13, 2021
1 parent 10f9dcb commit c4ab92f
Show file tree
Hide file tree
Showing 6 changed files with 177 additions and 63 deletions.
1 change: 1 addition & 0 deletions release-notes/VERSION-2.x
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ Modules:

#239: Should validate UTF-8 multi-byte validity for short decode path too
#248: Deprecate `CloseSafeUTF8Writer`, remove use
#252: Make `SmileFactory` support `JsonFactory.Feature.CANONICALIZE_FIELD_NAMES`
- `Ion-java` dep 1.4.0 -> 1.8.0

2.12.2 (03-Mar-2021)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,8 @@ public SmileGenerator createGenerator(OutputStream out) throws IOException {
/**
 * Constructs a {@link NonBlockingByteArrayParser} using a freshly created
 * I/O context, this factory's parser and Smile-format feature flags, and a
 * child of the factory's byte-based symbol canonicalizer.
 *
 * @return newly constructed non-blocking parser
 * @throws IOException declared by the overridden method
 */
@Override
public NonBlockingByteArrayParser createNonBlockingByteArrayParser() throws IOException {
    final IOContext ioContext = _createContext(null, false);
    // [dataformats-binary#252] (13-Mar-2021, tatu): the async parser keeps using
    //   an always-canonicalizing symbol table for now (2.13); to be improved later
    final ByteQuadsCanonicalizer symbols = _byteSymbolCanonicalizer.makeChild(_factoryFeatures);
    return new NonBlockingByteArrayParser(ioContext, _parserFeatures, _smileParserFeatures, symbols);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -763,14 +763,8 @@ public String nextFieldName() throws IOException
}
case 2: // short ASCII
{
int len = 1 + (ch & 0x3f);
String name = _findDecodedFromSymbols(len);
if (name != null) {
_inputPtr += len;
} else {
name = _decodeShortAsciiName(len);
name = _addDecodedToSymbols(len, name);
}
final int len = 1 + (ch & 0x3f);
final String name = _findOrDecodeShortAsciiName(len);
if (_seenNames != null) {
if (_seenNameCount >= _seenNames.length) {
_seenNames = _expandSeenNames(_seenNames);
Expand All @@ -796,16 +790,10 @@ public String nextFieldName() throws IOException
}
} else {
final int len = ch + 2; // values from 2 to 57...
String name = _findDecodedFromSymbols(len);
if (name != null) {
_inputPtr += len;
} else {
name = _decodeShortUnicodeName(len);
name = _addDecodedToSymbols(len, name);
}
final String name = _findOrDecodeShortUnicodeName(len);
if (_seenNames != null) {
if (_seenNameCount >= _seenNames.length) {
_seenNames = _expandSeenNames(_seenNames);
_seenNames = _expandSeenNames(_seenNames);
}
_seenNames[_seenNameCount++] = name;
}
Expand Down Expand Up @@ -1329,14 +1317,8 @@ protected final JsonToken _handleFieldName() throws IOException
return JsonToken.FIELD_NAME;
case 2: // short ASCII
{
int len = 1 + (ch & 0x3f);
String name = _findDecodedFromSymbols(len);
if (name != null) {
_inputPtr += len;
} else {
name = _decodeShortAsciiName(len);
name = _addDecodedToSymbols(len, name);
}
final int len = 1 + (ch & 0x3f);
final String name = _findOrDecodeShortAsciiName(len);
if (_seenNames != null) {
if (_seenNameCount >= _seenNames.length) {
_seenNames = _expandSeenNames(_seenNames);
Expand All @@ -1360,16 +1342,10 @@ protected final JsonToken _handleFieldName() throws IOException
}
} else {
final int len = ch + 2; // values from 2 to 57...
String name = _findDecodedFromSymbols(len);
if (name != null) {
_inputPtr += len;
} else {
name = _decodeShortUnicodeName(len);
name = _addDecodedToSymbols(len, name);
}
final String name = _findOrDecodeShortUnicodeName(len);
if (_seenNames != null) {
if (_seenNameCount >= _seenNames.length) {
_seenNames = _expandSeenNames(_seenNames);
_seenNames = _expandSeenNames(_seenNames);
}
_seenNames[_seenNameCount++] = name;
}
Expand All @@ -1384,6 +1360,46 @@ protected final JsonToken _handleFieldName() throws IOException
return null;
}

/**
 * Resolves a short ASCII field name of {@code len} bytes starting at the
 * current input position.
 *<p>
 * When symbol canonicalization is enabled the symbol table is consulted
 * first; on a hit the input pointer is advanced past the name, on a miss
 * the name is decoded and then registered in the symbol table. When
 * canonicalization is disabled the name is simply decoded.
 *
 * @param len length of the encoded name, in bytes
 * @return the field name
 * @throws IOException if loading more input or decoding fails
 */
private String _findOrDecodeShortAsciiName(final int len) throws IOException
{
    // Whole name must be available in the buffer before lookup or decoding
    final int available = _inputEnd - _inputPtr;
    if (available < len) {
        _loadToHaveAtLeast(len);
    }
    // Non-canonicalizing mode: no symbol table involvement at all
    if (!_symbolsCanonical) {
        return _decodeShortAsciiName(len);
    }
    final String known = _findDecodedFromSymbols(len);
    if (known != null) {
        // Lookup does not consume input, so skip the name bytes here
        _inputPtr += len;
        return known;
    }
    // NOTE(review): the decoder is presumably responsible for advancing
    //   the input pointer on this path -- not visible from here
    final String decoded = _decodeShortAsciiName(len);
    return _addDecodedToSymbols(len, decoded);
}

/**
 * Resolves a short Unicode (UTF-8 encoded) field name of {@code len} bytes
 * starting at the current input position.
 *<p>
 * When symbol canonicalization is enabled the symbol table is consulted
 * first; on a hit the input pointer is advanced past the name, on a miss
 * the name is decoded and then registered in the symbol table. When
 * canonicalization is disabled the name is simply decoded.
 *
 * @param len length of the encoded name, in bytes
 * @return the field name
 * @throws IOException if loading more input or decoding fails
 */
private String _findOrDecodeShortUnicodeName(final int len) throws IOException
{
    // Whole name must be available in the buffer before lookup or decoding
    final int available = _inputEnd - _inputPtr;
    if (available < len) {
        _loadToHaveAtLeast(len);
    }
    // Non-canonicalizing mode: no symbol table involvement at all
    if (!_symbolsCanonical) {
        return _decodeShortUnicodeName(len);
    }
    final String known = _findDecodedFromSymbols(len);
    if (known != null) {
        // Lookup does not consume input, so skip the name bytes here
        _inputPtr += len;
        return known;
    }
    // Not seen before: decode, then let the symbol table hand back the
    // instance to use
    final String decoded = _decodeShortUnicodeName(len);
    return _addDecodedToSymbols(len, decoded);
}

/**
* Method called to try to expand shared name area to fit one more potentially
* shared String. If area is already at its biggest size, will just clear
Expand Down Expand Up @@ -1474,7 +1490,6 @@ private final String _decodeShortUnicodeName(int len)
int outPtr = 0;
char[] outBuf = _textBuffer.emptyAndGetCurrentSegment();
int inPtr = _inputPtr;
_inputPtr += len;
final int[] codes = SmileConstants.sUtf8UnitLengths;
final byte[] inBuf = _inputBuffer;
for (int end = inPtr + len; inPtr < end; ) {
Expand Down Expand Up @@ -1502,16 +1517,21 @@ private final String _decodeShortUnicodeName(int len)
i = 0xDC00 | (i & 0x3FF);
break;
default: // invalid
_reportError("Invalid byte "+Integer.toHexString(i)+" in short Unicode text block");
// Update pointer here to point to (more) correct location
_inputPtr = inPtr;
_reportError("Invalid byte 0x"+Integer.toHexString(i)+" in short Unicode text block");
}
}
outBuf[outPtr++] = (char) i;
}
// let's only update offset here, so error message accurate
_inputPtr += len;
return _textBuffer.setCurrentAndReturn(outPtr);
}

// note: slightly edited copy of UTF8StreamParser.addName()
private final String _decodeLongUnicodeName(int[] quads, int byteLen, int quadLen)
private final String _decodeLongUnicodeName(int[] quads, int byteLen, int quadLen,
boolean addToSymbolTable)
throws IOException
{
int lastQuadBytes = byteLen & 3;
Expand Down Expand Up @@ -1611,7 +1631,10 @@ private final String _decodeLongUnicodeName(int[] quads, int byteLen, int quadLe
if (lastQuadBytes > 0) {
quads[quadLen-1] = lastQuad;
}
return _symbols.addName(baseName, quads, quadLen);
if (addToSymbolTable) {
return _symbols.addName(baseName, quads, quadLen);
}
return baseName;
}

private final void _handleLongFieldName() throws IOException
Expand Down Expand Up @@ -1674,9 +1697,11 @@ private final void _handleLongFieldName() throws IOException
byteLen += bytes;
}
// Know this name already?
String name = _symbols.findName(_quadBuffer, quads);
String name = _symbolsCanonical ?
_symbols.findName(_quadBuffer, quads) : null;
if (name == null) {
name = _decodeLongUnicodeName(_quadBuffer, byteLen, quads);
name = _decodeLongUnicodeName(_quadBuffer, byteLen, quads,
_symbolsCanonical);
}
if (_seenNames != null) {
if (_seenNameCount >= _seenNames.length) {
Expand All @@ -1689,13 +1714,12 @@ private final void _handleLongFieldName() throws IOException

/**
* Helper method for trying to find specified encoded UTF-8 byte sequence
* from symbol table; if successful avoids actual decoding to String
* from symbol table; if successful avoids actual decoding to String.
*<p>
* NOTE: caller MUST ensure input buffer has enough content.
*/
private final String _findDecodedFromSymbols(final int len) throws IOException
{
if ((_inputEnd - _inputPtr) < len) {
_loadToHaveAtLeast(len);
}
// First: maybe we already have this name decoded?
if (len < 5) {
int inPtr = _inputPtr;
Expand Down Expand Up @@ -1762,13 +1786,13 @@ private final String _findDecodedFromSymbols(final int len) throws IOException
_quad3 = q3;
return _symbols.findName(q1, q2, q3);
}
return _findDecodedLong(len, q1, q2);
return _findDecodedFixed12(len, q1, q2);
}

/**
* Method for locating names longer than 8 bytes (in UTF-8)
* Method for locating names longer than 12 bytes (in UTF-8)
*/
private final String _findDecodedLong(int len, int q1, int q2) throws IOException
private final String _findDecodedFixed12(int len, int q1, int q2) throws IOException
{
// first, need enough buffer to store bytes as ints:
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ public abstract class SmileParserBase extends ParserMinimalBase
* I/O context for this reader. It handles buffer allocation
* for the reader.
*/
final protected IOContext _ioContext;
protected final IOContext _ioContext;

/**
* Flag that indicates whether parser is closed or not. Gets
Expand Down Expand Up @@ -180,7 +180,7 @@ public abstract class SmileParserBase extends ParserMinimalBase
/**
* Symbol table that contains field names encountered so far
*/
final protected ByteQuadsCanonicalizer _symbols;
protected final ByteQuadsCanonicalizer _symbols;

/**
* Temporary buffer used for name parsing.
Expand Down Expand Up @@ -210,6 +210,18 @@ public abstract class SmileParserBase extends ParserMinimalBase

protected int _seenStringValueCount = -1;

/**
* Marker flag to indicate that standard symbol handling is used
* (one with symbol table assisted canonicalization). May be disabled,
* in which case alternate streamlined, non-canonicalizing handling
* is used: usually because the set of symbols
* (Object property names) is unbounded and will not benefit from
* canonicalization attempts.
*
* @since 2.13
*/
protected final boolean _symbolsCanonical;

/*
/**********************************************************
/* Thread-local recycling
Expand All @@ -221,14 +233,14 @@ public abstract class SmileParserBase extends ParserMinimalBase
* to a buffer recycler used to provide a low-cost
* buffer recycling for Smile-specific buffers.
*/
final protected static ThreadLocal<SoftReference<SmileBufferRecycler<String>>> _smileRecyclerRef
protected final static ThreadLocal<SoftReference<SmileBufferRecycler<String>>> _smileRecyclerRef
= new ThreadLocal<SoftReference<SmileBufferRecycler<String>>>();

/**
* Helper object used for low-level recycling of Smile-generator
* specific buffers.
*/
final protected SmileBufferRecycler<String> _smileBufferRecycler;
protected final SmileBufferRecycler<String> _smileBufferRecycler;

/*
/**********************************************************
Expand All @@ -243,6 +255,7 @@ public SmileParserBase(IOContext ctxt, int parserFeatures, int formatFeatures,
_formatFeatures = formatFeatures;
_ioContext = ctxt;
_symbols = sym;
_symbolsCanonical = sym.isCanonicalizing();
DupDetector dups = Feature.STRICT_DUPLICATE_DETECTION.enabledIn(parserFeatures)
? DupDetector.rootDetector(this) : null;
_streamReadContext = JsonReadContext.createRootContext(dups);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,19 @@
public class SmileParserBootstrapper
{
/*
/**********************************************************
/**********************************************************************
/* Configuration
/**********************************************************
/**********************************************************************
*/

protected final IOContext _context;

protected final InputStream _in;

/*
/**********************************************************
/**********************************************************************
/* Input buffering
/**********************************************************
/**********************************************************************
*/

protected final byte[] _inputBuffer;
Expand All @@ -44,9 +44,9 @@ public class SmileParserBootstrapper
protected final boolean _bufferRecyclable;

/*
/**********************************************************
/**********************************************************************
/* Input location
/**********************************************************
/**********************************************************************
*/

/**
Expand All @@ -59,9 +59,9 @@ public class SmileParserBootstrapper
protected int _inputProcessed;

/*
/**********************************************************
/**********************************************************************
/* Life-cycle
/**********************************************************
/**********************************************************************
*/

public SmileParserBootstrapper(IOContext ctxt, InputStream in)
Expand Down Expand Up @@ -91,7 +91,9 @@ public SmileParser constructParser(int factoryFeatures,
ObjectCodec codec, ByteQuadsCanonicalizer rootByteSymbols)
throws IOException, JsonParseException
{
ByteQuadsCanonicalizer can = rootByteSymbols.makeChild(factoryFeatures);
// 13-Mar-2021, tatu: [dataformats-binary#252] Create canonicalizing OR
// placeholder, depending on settings
ByteQuadsCanonicalizer can = rootByteSymbols.makeChildOrPlaceholder(factoryFeatures);
// We just need a single byte, really, to know if it starts with header
int end = _inputEnd;
if ((_inputPtr < end) && (_in != null)) {
Expand Down Expand Up @@ -136,9 +138,9 @@ public SmileParser constructParser(int factoryFeatures,
}

/*
/**********************************************************
/**********************************************************************
/* Encoding detection for data format auto-detection
/**********************************************************
/**********************************************************************
*/

public static MatchStrength hasSmileFormat(InputAccessor acc) throws IOException
Expand Down
Loading

0 comments on commit c4ab92f

Please sign in to comment.