Skip to content

Commit

Permalink
Some more refactoring work to reduce complexity and centralize handling of string literals in the parse_string() function instead of scattering the logic around
Browse files Browse the repository at this point in the history
  • Loading branch information
mangiucugna committed May 2, 2024
1 parent d7938d2 commit 3e99a99
Showing 1 changed file with 27 additions and 16 deletions.
43 changes: 27 additions & 16 deletions src/json_repair/json_repair.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,19 +72,12 @@ def parse_json(
"info",
)
return ""
# <string> starts with '"'
elif char in ['"', "'", "“"]:
# <string> starts with a quote
elif char in ['"', "'", "“"] or char.isalpha():
return self.parse_string()
# <number> starts with [0-9] or minus
elif char.isdigit() or char == "-" or char == ".":
return self.parse_number()
# <boolean> could be (T)rue or (F)alse or (N)ull
elif char.lower() in ["t", "f", "n"]:
return self.parse_boolean_or_null()
# This might be a <string> that is missing the starting '"'
# but this case can happen here only if we are parsing an object or array
elif self.get_context() != "" and char.isalpha():
return self.parse_string()
# If everything else fails, we just ignore and move on
else:
self.index += 1
Expand Down Expand Up @@ -192,6 +185,10 @@ def parse_array(self) -> List[Any]:
char = self.get_char_at()
# If this is the right value of an object and we are closing the object, it means the array is over
if self.get_context() == "object_value" and char == "}":
self.log(
"While parsing an array inside an object, we got to the end without finding a ]. Stopped parsing",
"info",
)
break

# Especially at the end of an LLM generated json you might miss the last "]"
Expand Down Expand Up @@ -230,6 +227,26 @@ def parse_string(self) -> str:
while char and char not in ['"', "'", "“"] and not char.isalpha():
self.index += 1
char = self.get_char_at()

if char.isalpha():
# This could be a <boolean> and not a string. Because (T)rue or (F)alse or (N)ull are valid
if char.lower() in ["t", "f", "n"]:
value = self.parse_boolean_or_null()
if value != "":
return value
self.log(
"While parsing a string, we found a literal instead of a quote",
"info",
)
if self.get_context() == "":
# A string literal in the wild isn't a valid json and not something we can fix
self.log(
"While parsing a string, we found a literal outside of context, ignoring it",
"info",
)
self.index += 1
return self.parse_json()

# Ensuring we use the right delimiter
if char == "'":
lstring_delimiter = rstring_delimiter = "'"
Expand Down Expand Up @@ -401,13 +418,7 @@ def parse_boolean_or_null(self) -> Union[bool, str, None]:
return value

# If nothing works
# If we are in array or object parse a string
if self.get_context() != "":
return self.parse_string()
else:
# Otherwise, let's skip this character and keep parsing
self.index += 1
return self.parse_json()
return ""

def insert_char_at(self, char: str) -> None:
self.json_str = self.json_str[: self.index] + char + self.json_str[self.index :]
Expand Down

0 comments on commit 3e99a99

Please sign in to comment.