diff --git a/runtime/interpreter/value.go b/runtime/interpreter/value.go index 3b7580bc23..f813b80e6a 100644 --- a/runtime/interpreter/value.go +++ b/runtime/interpreter/value.go @@ -1246,8 +1246,11 @@ var EmptyString = NewUnmeteredStringValue("") func (v *StringValue) Slice(from IntValue, to IntValue, locationRange LocationRange) Value { fromIndex := from.ToInt(locationRange) - toIndex := to.ToInt(locationRange) + return v.slice(fromIndex, toIndex, locationRange) +} + +func (v *StringValue) slice(fromIndex int, toIndex int, locationRange LocationRange) *StringValue { length := v.Length() @@ -1394,6 +1397,40 @@ func (v *StringValue) GetMember(interpreter *Interpreter, locationRange Location }, ) + case sema.StringTypeIndexFunctionName: + return NewBoundHostFunctionValue( + interpreter, + v, + sema.StringTypeIndexFunctionType, + func(invocation Invocation) Value { + other, ok := invocation.Arguments[0].(*StringValue) + if !ok { + panic(errors.NewUnreachableError()) + } + + return v.IndexOf(invocation.Interpreter, other) + }, + ) + + case sema.StringTypeCountFunctionName: + return NewBoundHostFunctionValue( + interpreter, + v, + sema.StringTypeCountFunctionType, + func(invocation Invocation) Value { + other, ok := invocation.Arguments[0].(*StringValue) + if !ok { + panic(errors.NewUnreachableError()) + } + + return v.Count( + invocation.Interpreter, + invocation.LocationRange, + other, + ) + }, + ) + case sema.StringTypeDecodeHexFunctionName: return NewBoundHostFunctionValue( interpreter, @@ -1703,36 +1740,59 @@ func (v *StringValue) ForEach( } } -func (v *StringValue) IsBoundaryStart(start int) bool { +func (v *StringValue) IsGraphemeBoundaryStart(startOffset int) bool { v.prepareGraphemes() - return v.isGraphemeBoundaryStartPrepared(start) + + var characterIndex int + return v.seekGraphemeBoundaryStartPrepared(startOffset, &characterIndex) } -func (v *StringValue) isGraphemeBoundaryStartPrepared(start int) bool { +func (v *StringValue) 
seekGraphemeBoundaryStartPrepared(startOffset int, characterIndex *int) bool { - for { - boundaryStart, _ := v.graphemes.Positions() - if start == boundaryStart { - return true - } else if boundaryStart > start { - return false + for ; v.graphemes.Next(); *characterIndex++ { + + boundaryStart, boundaryEnd := v.graphemes.Positions() + if boundaryStart == boundaryEnd { + // Graphemes.Positions() should never return a zero-length grapheme, + // and only does so if the grapheme iterator + // - is at the beginning of the string and has not been initialized (i.e. Next() has not been called); or + // - is at the end of the string and has been exhausted (i.e. Next() has returned false) + panic(errors.NewUnreachableError()) } - if !v.graphemes.Next() { + if startOffset == boundaryStart { + return true + } else if boundaryStart > startOffset { return false } } + + return false } -func (v *StringValue) IsBoundaryEnd(end int) bool { +func (v *StringValue) IsGraphemeBoundaryEnd(end int) bool { v.prepareGraphemes() + v.graphemes.Next() + return v.isGraphemeBoundaryEndPrepared(end) } func (v *StringValue) isGraphemeBoundaryEndPrepared(end int) bool { + // Empty strings have no grapheme clusters, and therefore no boundaries + if len(v.Str) == 0 { + return false + } for { - _, boundaryEnd := v.graphemes.Positions() + boundaryStart, boundaryEnd := v.graphemes.Positions() + if boundaryStart == boundaryEnd { + // Graphemes.Positions() should never return a zero-length grapheme, + // and only does so if the grapheme iterator + // - is at the beginning of the string and has not been initialized (i.e. Next() has not been called); or + // - is at the end of the string and has been exhausted (i.e. 
Next() has returned false) + panic(errors.NewUnreachableError()) + } + if end == boundaryEnd { return true } else if boundaryEnd > end { @@ -1745,7 +1805,16 @@ func (v *StringValue) isGraphemeBoundaryEndPrepared(end int) bool { } } -func (v *StringValue) Contains(inter *Interpreter, other *StringValue) BoolValue { +func (v *StringValue) IndexOf(inter *Interpreter, other *StringValue) IntValue { + index := v.indexOf(inter, other) + return NewIntValueFromInt64(inter, int64(index)) +} + +func (v *StringValue) indexOf(inter *Interpreter, other *StringValue) int { + + if len(other.Str) == 0 { + return 0 + } // Meter computation as if the string was iterated. // This is a conservative over-estimation. @@ -1753,22 +1822,93 @@ func (v *StringValue) Contains(inter *Interpreter, other *StringValue) BoolValue v.prepareGraphemes() - for start := 0; start < len(v.Str); start++ { + // We are dealing with two different positions / indices / measures: + // - 'CharacterIndex' indicates Cadence characters (grapheme clusters) + // - 'ByteOffset' indicates bytes + + // The resulting index, in terms of Cadence characters (grapheme clusters) + var characterIndex int + + // Find the position of the substring in the string, + // by using strings.Index with an increasing start byte offset. + // + // The byte offset returned from strings.Index is the start of the substring in the string, + // but it may not be at a grapheme boundary, so we need to check + // that both the start and end byte offsets are grapheme boundaries. + // + // We do not have a way to translate a byte offset into a character index. + // Instead, we iterate over the grapheme clusters until we reach the byte offset, + // keeping track of the character index. + // + // We need to back up and restore the grapheme iterator and character index + // when either the start or the end byte offset are not grapheme boundaries, + // so the next iteration can start from the correct position. 
+ + for searchStartByteOffset := 0; searchStartByteOffset < len(v.Str); searchStartByteOffset++ { - start = strings.Index(v.Str[start:], other.Str) - if start < 0 { + relativeFoundByteOffset := strings.Index(v.Str[searchStartByteOffset:], other.Str) + if relativeFoundByteOffset < 0 { break } - if v.isGraphemeBoundaryStartPrepared(start) && - v.isGraphemeBoundaryEndPrepared(start+len(other.Str)) { + // The resulting found byte offset is relative to the search start byte offset, + // so we need to add the search start byte offset to get the absolute byte offset + absoluteFoundByteOffset := searchStartByteOffset + relativeFoundByteOffset + + // Back up the grapheme iterator and character index, + // so the iteration state can be restored + // in case the byte offset is not at a grapheme boundary + graphemesBackup := *v.graphemes + characterIndexBackup := characterIndex - return TrueValue + if v.seekGraphemeBoundaryStartPrepared(absoluteFoundByteOffset, &characterIndex) && + v.isGraphemeBoundaryEndPrepared(absoluteFoundByteOffset+len(other.Str)) { + + return characterIndex } + + // Restore the grapheme iterator and character index + v.graphemes = &graphemesBackup + characterIndex = characterIndexBackup } - return FalseValue + return -1 +} + +func (v *StringValue) Contains(inter *Interpreter, other *StringValue) BoolValue { + return AsBoolValue(v.indexOf(inter, other) >= 0) +} + +func (v *StringValue) Count(inter *Interpreter, locationRange LocationRange, other *StringValue) IntValue { + index := v.count(inter, locationRange, other) + return NewIntValueFromInt64(inter, int64(index)) +} + +func (v *StringValue) count(inter *Interpreter, locationRange LocationRange, other *StringValue) int { + if other.Length() == 0 { + return 1 + v.Length() + } + + // Meter computation as if the string was iterated. 
+ inter.ReportComputation(common.ComputationKindLoop, uint(len(v.Str))) + + remaining := v + count := 0 + + for { + index := remaining.indexOf(inter, other) + if index == -1 { + return count + } + + count++ + remaining = remaining.slice( + index+other.Length(), + remaining.Length(), + locationRange, + ) + } } type StringValueIterator struct { diff --git a/runtime/interpreter/value_test.go b/runtime/interpreter/value_test.go index 1a0f82c6fe..c1c7d9a17b 100644 --- a/runtime/interpreter/value_test.go +++ b/runtime/interpreter/value_test.go @@ -4392,7 +4392,7 @@ func TestValue_ConformsToStaticType(t *testing.T) { } -func TestStringIsBoundaryStart(t *testing.T) { +func TestStringIsGraphemeBoundaryStart(t *testing.T) { t.Parallel() @@ -4402,11 +4402,11 @@ func TestStringIsBoundaryStart(t *testing.T) { t.Run(name, func(t *testing.T) { str := NewUnmeteredStringValue(s) - assert.Equal(t, expected, str.IsBoundaryStart(i)) + assert.Equal(t, expected, str.IsGraphemeBoundaryStart(i)) }) } - test("", 0, true) + test("", 0, false) test("a", 0, true) test("a", 1, false) test("ab", 1, true) @@ -4433,7 +4433,7 @@ func TestStringIsBoundaryStart(t *testing.T) { test(flagESflagEE, 15, false) } -func TestStringIsBoundaryEnd(t *testing.T) { +func TestStringIsGraphemeBoundaryEnd(t *testing.T) { t.Parallel() @@ -4443,19 +4443,19 @@ func TestStringIsBoundaryEnd(t *testing.T) { t.Run(name, func(t *testing.T) { str := NewUnmeteredStringValue(s) - assert.Equal(t, expected, str.IsBoundaryEnd(i)) + assert.Equal(t, expected, str.IsGraphemeBoundaryEnd(i)) }) } - test("", 0, true) - test("a", 0, true) + test("", 0, false) + test("a", 0, false) test("a", 1, true) test("ab", 1, true) // 🇪🇸🇪🇪 ("ES", "EE") flagESflagEE := "\U0001F1EA\U0001F1F8\U0001F1EA\U0001F1EA" require.Len(t, flagESflagEE, 16) - test(flagESflagEE, 0, true) + test(flagESflagEE, 0, false) test(flagESflagEE, 1, false) test(flagESflagEE, 2, false) test(flagESflagEE, 3, false) @@ -4472,4 +4472,7 @@ func 
TestStringIsBoundaryEnd(t *testing.T) { test(flagESflagEE, 13, false) test(flagESflagEE, 14, false) test(flagESflagEE, 15, false) + + test(flagESflagEE, 16, true) + } diff --git a/runtime/sema/string_type.go b/runtime/sema/string_type.go index 5ce6c2f1fb..7318e7fbb7 100644 --- a/runtime/sema/string_type.go +++ b/runtime/sema/string_type.go @@ -129,6 +129,18 @@ func init() { StringTypeContainsFunctionType, stringTypeContainsFunctionDocString, ), + NewUnmeteredPublicFunctionMember( + t, + StringTypeIndexFunctionName, + StringTypeIndexFunctionType, + stringTypeIndexFunctionDocString, + ), + NewUnmeteredPublicFunctionMember( + t, + StringTypeCountFunctionName, + StringTypeCountFunctionType, + stringTypeCountFunctionDocString, + ), }) } } @@ -194,6 +206,46 @@ const stringTypeContainsFunctionDocString = ` Returns true if this string contains the given other string as a substring. ` +var StringTypeIndexFunctionType = NewSimpleFunctionType( + FunctionPurityView, + []Parameter{ + { + Label: "of", + Identifier: "other", + TypeAnnotation: StringTypeAnnotation, + }, + }, + IntTypeAnnotation, +) + +const StringTypeIndexFunctionName = "index" + +const stringTypeIndexFunctionDocString = ` +Returns the index within this string of the first occurrence of the given substring. + +If the substring is not found, the function returns -1. +` + +var StringTypeCountFunctionType = NewSimpleFunctionType( + FunctionPurityView, + []Parameter{ + { + Label: ArgumentLabelNotRequired, + Identifier: "other", + TypeAnnotation: StringTypeAnnotation, + }, + }, + IntTypeAnnotation, +) + +const StringTypeCountFunctionName = "count" + +const stringTypeCountFunctionDocString = ` +Returns the number of non-overlapping instances of the given substring in this string. + +If the given substring is an empty string, the function returns 1 + the number of characters in this string. 
+` + const StringTypeReplaceAllFunctionName = "replaceAll" const StringTypeReplaceAllFunctionDocString = ` Returns a new string after replacing all the occurrences of parameter ` + "`of` with the parameter `with`" + `. diff --git a/runtime/tests/checker/string_test.go b/runtime/tests/checker/string_test.go index 756b6c9ec7..d857185350 100644 --- a/runtime/tests/checker/string_test.go +++ b/runtime/tests/checker/string_test.go @@ -579,3 +579,121 @@ func TestCheckStringContains(t *testing.T) { require.NoError(t, err) }) } + +func TestCheckStringIndex(t *testing.T) { + + t.Parallel() + + t.Run("missing argument", func(t *testing.T) { + + t.Parallel() + + _, err := ParseAndCheck(t, ` + let a = "abcdef" + let x: Int = a.index() + `) + + errs := RequireCheckerErrors(t, err, 1) + + assert.IsType(t, &sema.InsufficientArgumentsError{}, errs[0]) + }) + + t.Run("wrong argument type", func(t *testing.T) { + + t.Parallel() + + _, err := ParseAndCheck(t, ` + let a = "abcdef" + let x: Int = a.index(of: 1) + `) + + errs := RequireCheckerErrors(t, err, 1) + + assert.IsType(t, &sema.TypeMismatchError{}, errs[0]) + }) + + t.Run("wrong argument label", func(t *testing.T) { + + t.Parallel() + + _, err := ParseAndCheck(t, ` + let a = "abcdef" + let x: Int = a.index(foo: "bc") + `) + + errs := RequireCheckerErrors(t, err, 1) + + assert.IsType(t, &sema.IncorrectArgumentLabelError{}, errs[0]) + }) + + t.Run("missing argument label", func(t *testing.T) { + + t.Parallel() + + _, err := ParseAndCheck(t, ` + let a = "abcdef" + let x: Int = a.index("bc") + `) + + errs := RequireCheckerErrors(t, err, 1) + + assert.IsType(t, &sema.MissingArgumentLabelError{}, errs[0]) + }) + + t.Run("valid", func(t *testing.T) { + + t.Parallel() + + _, err := ParseAndCheck(t, ` + let a = "abcdef" + let x: Int = a.index(of: "bc") + `) + + require.NoError(t, err) + }) +} + +func TestCheckStringCount(t *testing.T) { + + t.Parallel() + + t.Run("missing argument", func(t *testing.T) { + + t.Parallel() + + _, err := 
ParseAndCheck(t, ` + let a = "abcdef" + let x: Int = a.count() + `) + + errs := RequireCheckerErrors(t, err, 1) + + assert.IsType(t, &sema.InsufficientArgumentsError{}, errs[0]) + }) + + t.Run("wrong argument type", func(t *testing.T) { + + t.Parallel() + + _, err := ParseAndCheck(t, ` + let a = "abcdef" + let x: Int = a.count(1) + `) + + errs := RequireCheckerErrors(t, err, 1) + + assert.IsType(t, &sema.TypeMismatchError{}, errs[0]) + }) + + t.Run("valid", func(t *testing.T) { + + t.Parallel() + + _, err := ParseAndCheck(t, ` + let a = "abcdef" + let x: Int = a.count("b") + `) + + require.NoError(t, err) + }) +} diff --git a/runtime/tests/interpreter/string_test.go b/runtime/tests/interpreter/string_test.go index b342e14ec9..6a276d245c 100644 --- a/runtime/tests/interpreter/string_test.go +++ b/runtime/tests/interpreter/string_test.go @@ -724,3 +724,147 @@ func TestInterpretStringContains(t *testing.T) { runTest(test) } } + +func TestInterpretStringIndex(t *testing.T) { + + t.Parallel() + + type test struct { + str string + subStr string + result int + } + + tests := []test{ + {"abcdef", "", 0}, + {"abcdef", "a", 0}, + {"abcdef", "ab", 0}, + {"abcdef", "ac", -1}, + {"abcdef", "b", 1}, + {"abcdef", "bc", 1}, + {"abcdef", "bcd", 1}, + {"abcdef", "c", 2}, + {"abcdef", "cd", 2}, + {"abcdef", "cdef", 2}, + {"abcdef", "cdefg", -1}, + {"abcdef", "abcdef", 0}, + {"abcdef", "abcdefg", -1}, + + // U+1F476 U+1F3FB is 👶🏻 + {" \\u{1F476}\\u{1F3FB} ascii \\u{D}\\u{A}", " \\u{1F476}", -1}, + {" \\u{1F476}\\u{1F3FB} ascii \\u{D}\\u{A}", "\\u{1F3FB}", -1}, + {" \\u{1F476}\\u{1F3FB} ascii \\u{D}\\u{A}", " \\u{1F476}\\u{1F3FB}", 0}, + {" \\u{1F476}\\u{1F3FB} ascii \\u{D}\\u{A}", "\\u{1F476}\\u{1F3FB}", 1}, + {" \\u{1F476}\\u{1F3FB} ascii \\u{D}\\u{A}", "\\u{1F476}\\u{1F3FB} ", 1}, + {" \\u{1F476}\\u{1F3FB} ascii \\u{D}\\u{A}", "\\u{D}", -1}, + {" \\u{1F476}\\u{1F3FB} ascii \\u{D}\\u{A}", "\\u{A}", -1}, + {" \\u{1F476}\\u{1F3FB} ascii \\u{D}\\u{A}", " ascii ", 2}, + + // 
🇪🇸🇪🇪 ("ES", "EE") contains 🇪🇸("ES") + {"\\u{1F1EA}\\u{1F1F8}\\u{1F1EA}\\u{1F1EA}", "\\u{1F1EA}\\u{1F1F8}", 0}, + // 🇪🇸🇪🇪 ("ES", "EE") contains 🇪🇪 ("EE") + {"\\u{1F1EA}\\u{1F1F8}\\u{1F1EA}\\u{1F1EA}", "\\u{1F1EA}\\u{1F1EA}", 1}, + // 🇪🇸🇪🇪 ("ES", "EE") does NOT contain 🇸🇪 ("SE") + {"\\u{1F1EA}\\u{1F1F8}\\u{1F1EA}\\u{1F1EA}", "\\u{1F1F8}\\u{1F1EA}", -1}, + // neither prefix nor suffix of codepoints are valid + {"\\u{1F1EA}\\u{1F1F8}\\u{1F1EA}\\u{1F1EA}", "\\u{1F1EA}\\u{1F1F8}\\u{1F1EA}", -1}, + {"\\u{1F1EA}\\u{1F1F8}\\u{1F1EA}\\u{1F1EA}", "\\u{1F1F8}\\u{1F1EA}\\u{1F1EA}", -1}, + } + + runTest := func(test test) { + + name := fmt.Sprintf("%s, %s", test.str, test.subStr) + + t.Run(name, func(t *testing.T) { + + t.Parallel() + + inter := parseCheckAndInterpret(t, + fmt.Sprintf( + ` + fun test(): Int { + let s = "%s" + return s.index(of: "%s") + } + `, + test.str, + test.subStr, + ), + ) + + value, err := inter.Invoke("test") + require.NoError(t, err) + + require.IsType(t, interpreter.IntValue{}, value) + actual := value.(interpreter.IntValue) + require.Equal(t, test.result, actual.ToInt(interpreter.EmptyLocationRange)) + }) + } + + for _, test := range tests { + runTest(test) + } +} + +func TestInterpretStringCount(t *testing.T) { + + t.Parallel() + + type test struct { + str string + subStr string + result int + } + + tests := []test{ + {"", "", 1}, + {"abcdef", "", 7}, + + {"", "notempty", 0}, + {"notempty", "", 9}, + {"smaller", "not smaller", 0}, + {"12345678987654321", "6", 2}, + {"611161116", "6", 3}, + {"notequal", "NotEqual", 0}, + {"equal", "equal", 1}, + {"abc1231231123q", "123", 3}, + {"11111", "11", 2}, + + // 🇪🇸🇪🇪🇪🇸 ("ES", "EE", "ES") contains 🇪🇸("ES") twice + {"\\u{1F1EA}\\u{1F1F8}\\u{1F1EA}\\u{1F1EA}\\u{1F1EA}\\u{1F1F8}", "\\u{1F1EA}\\u{1F1F8}", 2}, + } + + runTest := func(test test) { + + name := fmt.Sprintf("%s, %s", test.str, test.subStr) + + t.Run(name, func(t *testing.T) { + 
+ t.Parallel() + + inter := parseCheckAndInterpret(t, + fmt.Sprintf( + ` + fun test(): Int { + let s = "%s" + return s.count("%s") + } + `, + test.str, + test.subStr, + ), + ) + + value, err := inter.Invoke("test") + require.NoError(t, err) + + require.IsType(t, interpreter.IntValue{}, value) + actual := value.(interpreter.IntValue) + require.Equal(t, test.result, actual.ToInt(interpreter.EmptyLocationRange)) + }) + } + + for _, test := range tests { + runTest(test) + } +}