From 7262637aa0b53da0885ee94d9174eb477dfa8f00 Mon Sep 17 00:00:00 2001 From: Jianhong-Zhang Date: Fri, 17 May 2024 09:24:37 -0700 Subject: [PATCH] add HFTokenizer option and --seq-length argument for preprocess_data --- tools/preprocess_data.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 399f93c10e..652897a658 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -193,8 +193,10 @@ def get_args(): group.add_argument('--tokenizer-type', type=str, required=True, choices=['BertWordPieceLowerCase','BertWordPieceCase', 'GPT2BPETokenizer', 'SentencePieceTokenizer', - 'GPTSentencePieceTokenizer', 'NullTokenizer'], + 'GPTSentencePieceTokenizer', 'NullTokenizer', 'HFTokenizer'], help='What type of tokenizer to use.') + group.add_argument('--seq-length', type=int, default=1024, + help='The length of the packed inputs.') group.add_argument('--tokenizer-model', type=str, default=None, help='YTTM tokenizer model.') group.add_argument('--vocab-file', type=str, default=None,