diff --git a/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs b/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs
index bab16426f7..cdace9c1cc 100644
--- a/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs
+++ b/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs
@@ -50,15 +50,15 @@ namespace Lucene.Net.Codecs
public static class BlockTreeTermsWriter
{
///
- /// Suggested default value for the
- /// minItemsInBlock parameter to
+ /// Suggested default value for the
+ /// minItemsInBlock parameter to
/// .
///
public const int DEFAULT_MIN_BLOCK_SIZE = 25;
///
- /// Suggested default value for the
- /// maxItemsInBlock parameter to
+ /// Suggested default value for the
+ /// maxItemsInBlock parameter to
/// .
///
public const int DEFAULT_MAX_BLOCK_SIZE = 48;
@@ -296,12 +296,12 @@ public FieldMetaData(FieldInfo fieldInfo, BytesRef rootCode, long numTerms, long
/// to set state. It is *optional* and can be used when overriding the WriteHeader(),
/// WriteIndexHeader(). It only matters in the case where the state
/// is required inside of any of those methods that is passed in to the subclass constructor.
- ///
+ ///
/// When passed to the constructor, it is set to the protected field m_subclassState before
/// any of the above methods are called where it is available for reading when overriding the above methods.
- ///
+ ///
/// If your subclass needs to pass more than one piece of data, you can create a class or struct to do so.
- /// All other virtual members of BlockTreeTermsWriter are not called in the constructor,
+ /// All other virtual members of BlockTreeTermsWriter are not called in the constructor,
/// so the overrides of those methods won't specifically need to use this field (although they could for consistency).
///
[SuppressMessage("CodeQuality", "IDE0079:Remove unnecessary suppression", Justification = "This is a SonarCloud issue")]
@@ -468,7 +468,20 @@ public PendingBlock(BytesRef prefix, long fp, bool hasTerms, bool isFloor, int f
public override string ToString()
{
- return "BLOCK: " + Prefix.Utf8ToString();
+ return $"BLOCK: {Prefix.Utf8ToString()}";
+ }
+
+ #nullable enable
+ /// <summary>Try-pattern counterpart of <see cref="ToString()"/>; returns <c>false</c> when <see cref="Prefix"/> cannot be decoded as UTF-8.</summary>
+ public bool TryToString([NotNullWhen(true)] out string? result)
+ {
+     if (!Prefix.TryUtf8ToString(out string? decoded))
+     {
+         result = null;
+         return false;
+     }
+     result = $"BLOCK: {decoded}";
+     return true;
}
// LUCENENET specific - to keep the Debug.Assert statement from throwing exceptions
@@ -476,12 +489,11 @@ public override string ToString()
// to using PendingBlock.Prefix.ToString() if PendingBlock.ToString() errors.
// This struct defers formatting the string until it is actually used as a parameter
// in string.Format().
- private struct PendingBlocksFormatter // For assert
+ private readonly struct PendingBlocksFormatter // For assert
{
-#pragma warning disable IDE0044 // Add readonly modifier
- private IList blocks;
-#pragma warning restore IDE0044 // Add readonly modifier
- public PendingBlocksFormatter(IList blocks)
+ private readonly IList? blocks;
+
+ public PendingBlocksFormatter(IList? blocks)
{
this.blocks = blocks; // May be null
}
@@ -500,17 +512,17 @@ public override string ToString() // For assert
it.MoveNext();
while (true)
{
- var e = it.Current;
+ var e = it.Current ?? throw new InvalidOperationException("Expected a non-null value in the enumerator due to Count check above.");
// There is a chance that the Prefix will contain invalid UTF8,
// so we catch that and use the alternative way of displaying it
- try
+ if (e.TryToString(out string? eString))
{
- sb.Append(e.ToString());
+ sb.Append(eString);
}
- catch (IndexOutOfRangeException)
+ else
{
sb.Append("BLOCK: ");
- sb.Append(e.Prefix.ToString());
+ sb.Append(e.Prefix);
}
if (!it.MoveNext())
{
@@ -520,6 +532,7 @@ public override string ToString() // For assert
}
}
}
+ #nullable restore
public void CompileIndex(IList floorBlocks, RAMOutputStream scratchBytes)
{
@@ -1351,4 +1364,4 @@ protected override void Dispose(bool disposing)
}
}
}
-}
\ No newline at end of file
+}
diff --git a/src/Lucene.Net/Util/BytesRef.cs b/src/Lucene.Net/Util/BytesRef.cs
index 8012c77282..2e3679f8e0 100644
--- a/src/Lucene.Net/Util/BytesRef.cs
+++ b/src/Lucene.Net/Util/BytesRef.cs
@@ -243,6 +243,26 @@ public string Utf8ToString()
return @ref.ToString();
}
+ #nullable enable
+ /// <summary>
+ /// Attempts to decode the stored bytes as UTF-8, handing the decoded
+ /// <see cref="string"/> back through the output parameter <paramref name="result"/>.
+ /// </summary>
+ /// <param name="result">The decoded string when decoding succeeds; otherwise <c>null</c>.</param>
+ /// <returns><c>true</c> if successful, <c>false</c> otherwise.</returns>
+ public bool TryUtf8ToString([NotNullWhen(true)] out string? result)
+ {
+     if (!UnicodeUtil.TryUTF8toUTF16(bytes, Offset, Length, out CharsRef? decoded))
+     {
+         result = null;
+         return false;
+     }
+
+     result = decoded.ToString();
+     return true;
+ }
+ #nullable restore
+
///
/// Returns hex encoded bytes, eg [0x6c 0x75 0x63 0x65 0x6e 0x65]
public override string ToString()
diff --git a/src/Lucene.Net/Util/UnicodeUtil.cs b/src/Lucene.Net/Util/UnicodeUtil.cs
index 5974af1a16..65dd2fabc9 100644
--- a/src/Lucene.Net/Util/UnicodeUtil.cs
+++ b/src/Lucene.Net/Util/UnicodeUtil.cs
@@ -3,6 +3,7 @@
using Lucene.Net.Diagnostics;
using Lucene.Net.Support;
using System;
+using System.Diagnostics.CodeAnalysis;
using System.Runtime.CompilerServices;
using System.Text;
@@ -123,13 +124,13 @@ public static class UnicodeUtil
private const int SURROGATE_OFFSET = Character.MinSupplementaryCodePoint - (UNI_SUR_HIGH_START << (int)HALF_SHIFT) - UNI_SUR_LOW_START;
///
- /// Encode characters from a , starting at
+ /// Encode characters from a <see cref="ReadOnlySpan{T}"/> (with generic type argument <see cref="char"/>) <paramref name="source"/>, starting at
/// and ending at . After encoding, result.Offset will always be 0.
///
/// is null.
// TODO: broken if incoming result.offset != 0
// LUCENENET specific overload
- public static void UTF16toUTF8(Span source, BytesRef result)
+ public static void UTF16toUTF8(ReadOnlySpan source, BytesRef result)
{
// LUCENENET: Added guard clause
if (result is null)
@@ -200,7 +201,7 @@ public static void UTF16toUTF8(Span source, BytesRef result)
}
///
- /// Encode characters from a , starting at
+ /// Encode characters from a <see cref="ReadOnlySpan{T}"/> (with generic type argument <see cref="char"/>) <paramref name="source"/>, starting at
/// for chars. After encoding, result.Offset will always be 0.
///
/// or is null.
@@ -212,11 +213,9 @@ public static void UTF16toUTF8(Span source, BytesRef result)
/// and refer to a location outside of .
///
// TODO: broken if incoming result.offset != 0
- public static void UTF16toUTF8(char[] source, int offset, int length, BytesRef result)
+ public static void UTF16toUTF8(ReadOnlySpan source, int offset, int length, BytesRef result)
{
// LUCENENET: Added guard clauses
- if (source is null)
- throw new ArgumentNullException(nameof(source));
if (result is null)
throw new ArgumentNullException(nameof(result));
if (offset < 0)
@@ -633,7 +632,7 @@ public static bool ValidUTF16String(StringBuilder s) // LUCENENET specific overl
return true;
}
- public static bool ValidUTF16String(char[] s, int size)
+ public static bool ValidUTF16String(ReadOnlySpan s, int size)
{
for (int i = 0; i < size; i++)
{
@@ -828,16 +827,16 @@ public static void UTF8toUTF32(BytesRef utf8, Int32sRef utf32)
private const int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);
///
- /// Cover JDK 1.5 API. Create a String from an array of .
+ /// Cover JDK 1.5 API. Create a String from a span of .
///
- /// The code array.
- /// The start of the text in the code point array.
+ /// The code point span.
+ /// The start of the text in the code point span.
/// The number of code points.
/// a String representing the code points between offset and count.
/// If an invalid code point is encountered.
/// If the offset or count are out of bounds.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
- public static string NewString(int[] codePoints, int offset, int count)
+ public static string NewString(ReadOnlySpan codePoints, int offset, int count)
{
// LUCENENET: Character.ToString() was optimized to use the stack for arrays
// of codepoints 256 or less, so it performs better than using ToCharArray().
@@ -849,26 +848,26 @@ public static string NewString(int[] codePoints, int offset, int count)
///
/// LUCENENET specific.
///
- /// The code array.
- /// The start of the text in the code point array.
+ /// The code span.
+ /// The start of the text in the code point span.
/// The number of code points.
/// a char array representing the code points between offset and count.
// LUCENENET NOTE: This code was originally in the NewString() method (above).
// It has been refactored from the original to remove the exception throw/catch and
- // instead proactively resizes the array instead of relying on excpetions + copy operations
- public static char[] ToCharArray(int[] codePoints, int offset, int count)
+ // instead proactively resizes the array instead of relying on exceptions + copy operations
+ public static char[] ToCharArray(ReadOnlySpan codePoints, int offset, int count)
{
if (count < 0)
{
throw new ArgumentOutOfRangeException(nameof(count), "count must be >= 0"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
}
- const int countThreashold = 1024; // If the number of chars exceeds this, we count them instead of allocating count * 2
+ const int countThreshold = 1024; // If the number of chars exceeds this, we count them instead of allocating count * 2
// LUCENENET: as a first approximation, assume each codepoint
// is 2 characters (since it cannot be longer than this)
int arrayLength = count * 2;
- // LUCENENET: if we go over the threashold, count the number of
+ // LUCENENET: if we go over the threshold, count the number of
// chars we will need so we can allocate the precise amount of memory
- if (count > countThreashold)
+ if (count > countThreshold)
{
arrayLength = 0;
for (int r = offset, e = offset + count; r < e; ++r)
@@ -951,15 +950,18 @@ public static string ToHexString(string s)
}
///
- /// Interprets the given byte array as UTF-8 and converts to UTF-16. The will be extended if
+ /// Interprets the given byte span as UTF-8 and converts to UTF-16. The will be extended if
/// it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint.
///
/// NOTE: Full characters are read, even if this reads past the length passed (and
/// can result in an if invalid UTF-8 is passed).
/// Explicit checks for valid UTF-8 are not performed.
///
+ /// <remarks>
+ /// LUCENENET specific: This method uses <see cref="ReadOnlySpan{T}"/> (with generic type argument <see cref="byte"/>) instead of byte[].
+ /// </remarks>
// TODO: broken if chars.offset != 0
- public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef chars)
+ public static void UTF8toUTF16(ReadOnlySpan utf8, int offset, int length, CharsRef chars)
{
int out_offset = chars.Offset = 0;
char[] @out = chars.Chars = ArrayUtil.Grow(chars.Chars, length);
@@ -1001,9 +1003,85 @@ public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef cha
chars.Length = out_offset - chars.Offset;
}
+ #nullable enable
+ /// <summary>
+ /// Tries to interpret <paramref name="length"/> bytes of <paramref name="utf8"/>, starting at <paramref name="offset"/>, as UTF-8 and convert them to UTF-16, providing the result in a new <see cref="CharsRef"/> via <paramref name="chars"/>.
+ /// <para/>
+ /// NOTE: Only lead bytes and sequence bounds are checked; continuation bytes are not validated. Returns <c>false</c> (with <paramref name="chars"/> set to <c>null</c>) when a byte cannot start a sequence or a sequence is cut off by the requested range or the end of the span.
+ /// </summary>
+ /// <remarks>
+ /// LUCENENET specific: This method uses <see cref="ReadOnlySpan{T}"/> (with generic type argument <see cref="byte"/>) instead of byte[].
+ /// </remarks>
+ public static bool TryUTF8toUTF16(ReadOnlySpan<byte> utf8, int offset, int length, [NotNullWhen(true)] out CharsRef? chars)
+ {
+     chars = null; // every early return below reports failure with a null result
+     CharsRef result = new CharsRef(length);
+     char[] @out = result.Chars;
+     int out_offset = 0;
+     int limit = offset + length;
+     int end = Math.Min(limit, utf8.Length); // never read past the requested range or the span, whichever ends first
+     while (offset < limit)
+     {
+         if (end <= offset)
+         {
+             return false;
+         }
+         int b = utf8[offset++] & 0xff;
+         if (b < 0x80)
+         {
+             @out[out_offset++] = (char)b;
+         }
+         else if (b < 0xc0 || b >= 0xf8)
+         {
+             // Stray continuation byte or out-of-range lead byte: report failure rather than assert.
+             return false;
+         }
+         else if (b < 0xe0)
+         {
+             if (end <= offset)
+             {
+                 return false;
+             }
+             @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[offset++] & 0x3f));
+         }
+         else if (b < 0xf0)
+         {
+             if (end <= offset + 1)
+             {
+                 return false;
+             }
+             @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[offset] & 0x3f) << 6) + (utf8[offset + 1] & 0x3f));
+             offset += 2;
+         }
+         else
+         {
+             if (end <= offset + 2)
+             {
+                 return false;
+             }
+             int ch = ((b & 0x7) << 18) + ((utf8[offset] & 0x3f) << 12) + ((utf8[offset + 1] & 0x3f) << 6) + (utf8[offset + 2] & 0x3f);
+             offset += 3;
+             if (ch < UNI_MAX_BMP)
+             {
+                 @out[out_offset++] = (char)ch;
+             }
+             else
+             {
+                 int chHalf = ch - 0x0010000;
+                 @out[out_offset++] = (char)((chHalf >> 10) + 0xD800);
+                 @out[out_offset++] = (char)((chHalf & HALF_MASK) + 0xDC00);
+             }
+         }
+     }
+     result.Length = out_offset;
+     chars = result;
+     return true;
+ }
+ #nullable restore
+
///
- /// Utility method for
- ///
+ /// Utility method for
+ ///
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void UTF8toUTF16(BytesRef bytesRef, CharsRef chars)
{