[fix] use proper decode batch functions in _decode_batch
bhavnicksm committed Jan 4, 2025
1 parent 5b75303 commit 87b6306
Showing 1 changed file with 3 additions and 3 deletions.
6 changes: 3 additions & 3 deletions src/chonkie/chunker/base.py
@@ -183,11 +183,11 @@ def _decode(self, tokens) -> str:
     def _decode_batch(self, token_lists: List[List[int]]) -> List[str]:
         """Decode a batch of token lists using the backend tokenizer."""
         if self._tokenizer_backend == "transformers":
-            return [self.tokenizer.decode(tokens) for tokens in token_lists]
+            return self.tokenizer.batch_decode(token_lists, skip_special_tokens=True)
         elif self._tokenizer_backend == "tokenizers":
-            return [self.tokenizer.decode(tokens) for tokens in token_lists]
+            return self.tokenizer.decode_batch(token_lists)
         elif self._tokenizer_backend == "tiktoken":
-            return [self.tokenizer.decode(tokens) for tokens in token_lists]
+            return self.tokenizer.decode_batch(token_lists)
         elif self._tokenizer_backend == "callable":
             raise NotImplementedError(
                 "Callable tokenizer backend does not support batch decoding."
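For reference, a minimal sketch (not part of the commit) of the dedicated batch-decode calls this fix switches to, one per backend; the "gpt2" model name, the cl100k_base encoding, and the token ids are illustrative assumptions, not values taken from the repository.

# Sketch of each backend's batch-decode API; model names and ids are illustrative.
import tiktoken
from tokenizers import Tokenizer
from transformers import AutoTokenizer

token_lists = [[15496, 995], [15496, 612]]  # example token-id batches

hf_tok = AutoTokenizer.from_pretrained("gpt2")
print(hf_tok.batch_decode(token_lists, skip_special_tokens=True))  # transformers backend

raw_tok = Tokenizer.from_pretrained("gpt2")
print(raw_tok.decode_batch(token_lists))  # tokenizers backend

enc = tiktoken.get_encoding("cl100k_base")
print(enc.decode_batch(token_lists))  # tiktoken backend

In each case the library's own batch method replaces the previous per-item Python loop: the tokenizers and tiktoken backends can hand the whole batch to their native implementations, and the transformers backend additionally applies skip_special_tokens=True so special tokens do not leak into the reconstructed text.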
