Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add String.index and String.count, fix grapheme boundary functions #3456

Merged
merged 1 commit into from
Jul 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
182 changes: 161 additions & 21 deletions runtime/interpreter/value.go
Original file line number Diff line number Diff line change
Expand Up @@ -1246,8 +1246,11 @@ var EmptyString = NewUnmeteredStringValue("")

// Slice converts the given bounds to Go ints and delegates to slice,
// returning the resulting string as a Value.
//
// The lower bound is converted before the upper bound, so a failing
// conversion of `from` is reported first.
func (v *StringValue) Slice(from IntValue, to IntValue, locationRange LocationRange) Value {
	return v.slice(
		from.ToInt(locationRange),
		to.ToInt(locationRange),
		locationRange,
	)
}

func (v *StringValue) slice(fromIndex int, toIndex int, locationRange LocationRange) *StringValue {

length := v.Length()

Expand Down Expand Up @@ -1394,6 +1397,40 @@ func (v *StringValue) GetMember(interpreter *Interpreter, locationRange Location
},
)

case sema.StringTypeIndexFunctionName:
	// Host function backing the `index` member, bound to this string.
	return NewBoundHostFunctionValue(
		interpreter,
		v,
		sema.StringTypeIndexFunctionType,
		func(invocation Invocation) Value {
			// The single argument is expected to be a string;
			// any other value is treated as unreachable.
			if needle, isString := invocation.Arguments[0].(*StringValue); isString {
				return v.IndexOf(invocation.Interpreter, needle)
			}

			panic(errors.NewUnreachableError())
		},
	)

case sema.StringTypeCountFunctionName:
	// Host function backing the `count` member, bound to this string.
	return NewBoundHostFunctionValue(
		interpreter,
		v,
		// Use the count function's own type, not the index function's:
		// `count` takes its argument without a label,
		// whereas `index` requires the `of` label.
		sema.StringTypeCountFunctionType,
		func(invocation Invocation) Value {
			// The single argument is expected to be a string;
			// any other value is treated as unreachable.
			other, ok := invocation.Arguments[0].(*StringValue)
			if !ok {
				panic(errors.NewUnreachableError())
			}

			return v.Count(
				invocation.Interpreter,
				invocation.LocationRange,
				other,
			)
		},
	)

case sema.StringTypeDecodeHexFunctionName:
return NewBoundHostFunctionValue(
interpreter,
Expand Down Expand Up @@ -1703,36 +1740,59 @@ func (v *StringValue) ForEach(
}
}

func (v *StringValue) IsBoundaryStart(start int) bool {
func (v *StringValue) IsGraphemeBoundaryStart(startOffset int) bool {
v.prepareGraphemes()
return v.isGraphemeBoundaryStartPrepared(start)

var characterIndex int
return v.seekGraphemeBoundaryStartPrepared(startOffset, &characterIndex)
}

func (v *StringValue) isGraphemeBoundaryStartPrepared(start int) bool {
func (v *StringValue) seekGraphemeBoundaryStartPrepared(startOffset int, characterIndex *int) bool {

for {
boundaryStart, _ := v.graphemes.Positions()
if start == boundaryStart {
return true
} else if boundaryStart > start {
return false
for ; v.graphemes.Next(); *characterIndex++ {

boundaryStart, boundaryEnd := v.graphemes.Positions()
if boundaryStart == boundaryEnd {
// Graphemes.Positions() should never return a zero-length grapheme,
// and only does so if the grapheme iterator
// - is at the beginning of the string and has not been initialized (i.e. Next() has not been called); or
// - is at the end of the string and has been exhausted (i.e. Next() has returned false)
panic(errors.NewUnreachableError())
}

if !v.graphemes.Next() {
if startOffset == boundaryStart {
return true
} else if boundaryStart > startOffset {
return false
}
}

return false
}

func (v *StringValue) IsBoundaryEnd(end int) bool {
// IsGraphemeBoundaryEnd reports whether the byte offset `end`
// is the end of a grapheme cluster of this string.
//
// It sets up the grapheme iterator and then delegates
// to isGraphemeBoundaryEndPrepared.
func (v *StringValue) IsGraphemeBoundaryEnd(end int) bool {
// NOTE(review): prepareGraphemes is defined outside this view;
// presumably it (re)initializes v.graphemes — confirm.
v.prepareGraphemes()
// Advance to the first grapheme cluster before delegating:
// isGraphemeBoundaryEndPrepared reads Positions() immediately and panics
// on a zero-length grapheme, which is what an iterator reports
// when Next() has not been called yet.
v.graphemes.Next()

return v.isGraphemeBoundaryEndPrepared(end)
}

func (v *StringValue) isGraphemeBoundaryEndPrepared(end int) bool {
// Empty strings have no grapheme clusters, and therefore no boundaries
if len(v.Str) == 0 {
return false
}

for {
_, boundaryEnd := v.graphemes.Positions()
boundaryStart, boundaryEnd := v.graphemes.Positions()
if boundaryStart == boundaryEnd {
// Graphemes.Positions() should never return a zero-length grapheme,
// and only does so if the grapheme iterator
// - is at the beginning of the string and has not been initialized (i.e. Next() has not been called); or
// - is at the end of the string and has been exhausted (i.e. Next() has returned false)
panic(errors.NewUnreachableError())
}

if end == boundaryEnd {
return true
} else if boundaryEnd > end {
Expand All @@ -1745,30 +1805,110 @@ func (v *StringValue) isGraphemeBoundaryEndPrepared(end int) bool {
}
}

func (v *StringValue) Contains(inter *Interpreter, other *StringValue) BoolValue {
// IndexOf returns, as an IntValue, the index computed by indexOf
// for the first occurrence of other in this string.
//
// The index is -1 if other does not occur in this string.
func (v *StringValue) IndexOf(inter *Interpreter, other *StringValue) IntValue {
	return NewIntValueFromInt64(
		inter,
		int64(v.indexOf(inter, other)),
	)
}

func (v *StringValue) indexOf(inter *Interpreter, other *StringValue) int {

if len(other.Str) == 0 {
return 0
}

// Meter computation as if the string was iterated.
// This is a conservative over-estimation.
inter.ReportComputation(common.ComputationKindLoop, uint(len(v.Str)*len(other.Str)))

v.prepareGraphemes()

for start := 0; start < len(v.Str); start++ {
// We are dealing with two different positions / indices / measures:
// - 'CharacterIndex' indicates Cadence characters (grapheme clusters)
// - 'ByteOffset' indicates bytes

// The resulting index, in terms of Cadence characters (grapheme clusters)
var characterIndex int

// Find the position of the substring in the string,
// by using strings.Index with an increasing start byte offset.
//
// The byte offset returned from strings.Index is the start of the substring in the string,
// but it may not be at a grapheme boundary, so we need to check
// that both the start and end byte offsets are grapheme boundaries.
//
// We do not have a way to translate a byte offset into a character index.
// Instead, we iterate over the grapheme clusters until we reach the byte offset,
// keeping track of the character index.
//
// We need to back up and restore the grapheme iterator and character index
// when either the start or the end byte offset are not grapheme boundaries,
// so the next iteration can start from the correct position.

for searchStartByteOffset := 0; searchStartByteOffset < len(v.Str); searchStartByteOffset++ {

start = strings.Index(v.Str[start:], other.Str)
if start < 0 {
relativeFoundByteOffset := strings.Index(v.Str[searchStartByteOffset:], other.Str)
if relativeFoundByteOffset < 0 {
break
}

if v.isGraphemeBoundaryStartPrepared(start) &&
v.isGraphemeBoundaryEndPrepared(start+len(other.Str)) {
// The resulting found byte offset is relative to the search start byte offset,
// so we need to add the search start byte offset to get the absolute byte offset
absoluteFoundByteOffset := searchStartByteOffset + relativeFoundByteOffset

// Back up the grapheme iterator and character index,
// so the iteration state can be restored
// in case the byte offset is not at a grapheme boundary
graphemesBackup := *v.graphemes
characterIndexBackup := characterIndex

return TrueValue
if v.seekGraphemeBoundaryStartPrepared(absoluteFoundByteOffset, &characterIndex) &&
v.isGraphemeBoundaryEndPrepared(absoluteFoundByteOffset+len(other.Str)) {

return characterIndex
}

// Restore the grapheme iterator and character index
v.graphemes = &graphemesBackup
characterIndex = characterIndexBackup
}

return FalseValue
return -1
}

// Contains reports whether other occurs in this string as a substring,
// i.e. whether indexOf yields a non-negative index for it.
func (v *StringValue) Contains(inter *Interpreter, other *StringValue) BoolValue {
	found := v.indexOf(inter, other) >= 0
	return AsBoolValue(found)
}

// Count returns, as an IntValue, the number of occurrences of other
// in this string, as computed by count.
func (v *StringValue) Count(inter *Interpreter, locationRange LocationRange, other *StringValue) IntValue {
	occurrences := int64(v.count(inter, locationRange, other))
	return NewIntValueFromInt64(inter, occurrences)
}

// count returns the number of non-overlapping occurrences of other in this string.
//
// If other is empty, the result is 1 + the length of this string.
// Otherwise, the string is searched repeatedly with indexOf:
// after each match, the search continues in the remainder
// that starts right after the matched occurrence.
func (v *StringValue) count(inter *Interpreter, locationRange LocationRange, other *StringValue) int {
	if other.Length() == 0 {
		return v.Length() + 1
	}

	// Meter computation as if the string was iterated.
	inter.ReportComputation(common.ComputationKindLoop, uint(len(v.Str)))

	var occurrences int

	// The counter is only incremented after an iteration which found a match.
	for haystack := v; ; occurrences++ {
		matchIndex := haystack.indexOf(inter, other)
		if matchIndex == -1 {
			break
		}

		// Drop everything up to and including the match,
		// so occurrences are counted without overlap.
		haystack = haystack.slice(
			matchIndex+other.Length(),
			haystack.Length(),
			locationRange,
		)
	}

	return occurrences
}

type StringValueIterator struct {
Expand Down
19 changes: 11 additions & 8 deletions runtime/interpreter/value_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4392,7 +4392,7 @@ func TestValue_ConformsToStaticType(t *testing.T) {

}

func TestStringIsBoundaryStart(t *testing.T) {
func TestStringIsGraphemeBoundaryStart(t *testing.T) {

t.Parallel()

Expand All @@ -4402,11 +4402,11 @@ func TestStringIsBoundaryStart(t *testing.T) {

t.Run(name, func(t *testing.T) {
str := NewUnmeteredStringValue(s)
assert.Equal(t, expected, str.IsBoundaryStart(i))
assert.Equal(t, expected, str.IsGraphemeBoundaryStart(i))
})
}

test("", 0, true)
test("", 0, false)
test("a", 0, true)
test("a", 1, false)
test("ab", 1, true)
Expand All @@ -4433,7 +4433,7 @@ func TestStringIsBoundaryStart(t *testing.T) {
test(flagESflagEE, 15, false)
}

func TestStringIsBoundaryEnd(t *testing.T) {
func TestStringIsGraphemeBoundaryEnd(t *testing.T) {

t.Parallel()

Expand All @@ -4443,19 +4443,19 @@ func TestStringIsBoundaryEnd(t *testing.T) {

t.Run(name, func(t *testing.T) {
str := NewUnmeteredStringValue(s)
assert.Equal(t, expected, str.IsBoundaryEnd(i))
assert.Equal(t, expected, str.IsGraphemeBoundaryEnd(i))
})
}

test("", 0, true)
test("a", 0, true)
test("", 0, false)
test("a", 0, false)
test("a", 1, true)
test("ab", 1, true)

// 🇪🇸🇪🇪 ("ES", "EE")
flagESflagEE := "\U0001F1EA\U0001F1F8\U0001F1EA\U0001F1EA"
require.Len(t, flagESflagEE, 16)
test(flagESflagEE, 0, true)
test(flagESflagEE, 0, false)
test(flagESflagEE, 1, false)
test(flagESflagEE, 2, false)
test(flagESflagEE, 3, false)
Expand All @@ -4472,4 +4472,7 @@ func TestStringIsBoundaryEnd(t *testing.T) {
test(flagESflagEE, 13, false)
test(flagESflagEE, 14, false)
test(flagESflagEE, 15, false)

test(flagESflagEE, 16, true)

}
52 changes: 52 additions & 0 deletions runtime/sema/string_type.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,18 @@ func init() {
StringTypeContainsFunctionType,
stringTypeContainsFunctionDocString,
),
NewUnmeteredPublicFunctionMember(
t,
StringTypeIndexFunctionName,
StringTypeIndexFunctionType,
stringTypeIndexFunctionDocString,
),
NewUnmeteredPublicFunctionMember(
t,
StringTypeCountFunctionName,
StringTypeCountFunctionType,
stringTypeCountFunctionDocString,
),
})
}
}
Expand Down Expand Up @@ -194,6 +206,46 @@ const stringTypeContainsFunctionDocString = `
Returns true if this string contains the given other string as a substring.
`

// StringTypeIndexFunctionType is the type of the `index` member function of the string type:
// a view function with a single String parameter (argument label `of`, name `other`)
// and an Int return type.
var StringTypeIndexFunctionType = NewSimpleFunctionType(
FunctionPurityView,
[]Parameter{
{
Label: "of",
Identifier: "other",
TypeAnnotation: StringTypeAnnotation,
},
},
IntTypeAnnotation,
)

// StringTypeIndexFunctionName is the name of the `index` member function of the string type.
const StringTypeIndexFunctionName = "index"

// stringTypeIndexFunctionDocString is the documentation string
// registered together with the `index` member function.
const stringTypeIndexFunctionDocString = `
Returns the index within this string of the first occurrence of the given substring.

If the substring is not found, the function returns -1.
`

// StringTypeCountFunctionType is the type of the `count` member function of the string type:
// a view function with a single String parameter named `other`,
// declared with ArgumentLabelNotRequired (i.e. passed without an argument label),
// and an Int return type.
var StringTypeCountFunctionType = NewSimpleFunctionType(
FunctionPurityView,
[]Parameter{
{
Label: ArgumentLabelNotRequired,
Identifier: "other",
TypeAnnotation: StringTypeAnnotation,
},
},
IntTypeAnnotation,
)

// StringTypeCountFunctionName is the name of the `count` member function of the string type.
const StringTypeCountFunctionName = "count"

// stringTypeCountFunctionDocString is the documentation string
// registered together with the `count` member function.
const stringTypeCountFunctionDocString = `
Returns the number of non-overlapping instances of the given substring in this string.

If the given substring is an empty string, the function returns 1 + the number of characters in this string.
`

const StringTypeReplaceAllFunctionName = "replaceAll"
const StringTypeReplaceAllFunctionDocString = `
Returns a new string after replacing all the occurrences of parameter ` + "`of` with the parameter `with`" + `.
Expand Down
Loading
Loading