forked from kwonmha/bert-vocab-builder
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsubword_builder.py
86 lines (71 loc) · 3.52 KB
/
subword_builder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#-*- coding: utf-8 -*-
# Copyright 2018 The Tensor2Tensor Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""Program to build a SubwordTextEncoder.
The flags --min_count and --corpus_max_lines will affect the size of the
vocabulary. Try changing these flags until you get a vocabulary
of the size you want.
Example usage:
python subword_builder.py \
    --corpus_filepattern=$DATA_DIR/my_problem-train-* \
    --corpus_max_lines=12345 \
    --output_filename=$DATA_DIR/my_problem.subword_text_encoder
"""
from __future__ import absolute_import
from __future__ import division
import text_encoder
import tokenizer
import tensorflow as tf
# Command-line flags. Every call below registers one flag, with positional
# arguments in the order (name, default value, help text).

# Path handed to encoder.store_to_file() at the end of main().
tf.flags.DEFINE_string(
    'output_filename',
    '/tmp/my.subword_text_encoder',
    'where to store the SubwordTextEncoder',
)

# Input selection: main() raises ValueError unless exactly one of the two
# file patterns below is non-empty.
tf.flags.DEFINE_string(
    'corpus_filepattern',
    '',
    'Corpus of one or more text files',
)
tf.flags.DEFINE_string(
    'vocab_filepattern',
    '',
    'One or more vocabulary files (one word per line as "word,count")',
)

# Forwarded to SubwordTextEncoder.build_from_token_counts().
tf.flags.DEFINE_integer(
    'min_count',
    5,
    'Minimum subtoken count in corpus',
)

# Forwarded to whichever token-count reader main() ends up calling.
tf.flags.DEFINE_integer(
    'corpus_max_lines',
    None,
    'How many lines of corpus to read',
)

# Forwarded to SubwordTextEncoder.build_from_token_counts().
tf.flags.DEFINE_integer(
    'num_iterations',
    5,
    'Number of iterations',
)

# The next two flags are only used when reading from --corpus_filepattern.
tf.flags.DEFINE_bool(
    'split_on_newlines',
    True,
    'Break corpus into lines.',
)
tf.flags.DEFINE_string(
    'additional_chars',
    '',
    'Set special characters to be included in vocab. '
    'ex : "~", "/".',
)

# Forwarded to SubwordTextEncoder.build_from_token_counts() as keyword args.
tf.flags.DEFINE_integer(
    'max_subtoken_length',
    None,
    'Max subtoken length',
)
tf.flags.DEFINE_bool(
    'backward',
    False,
    'Builds subwords from backward.',
)

# main() accepts only 'DEBUG', 'INFO' or 'ERROR' for this flag.
tf.flags.DEFINE_string(
    'log_level',
    'INFO',
    'Set verbosity of logger',
)

# Shared handle through which main() reads the flag values.
FLAGS = tf.flags.FLAGS
def main(unused_argv):
  """Builds a SubwordTextEncoder from the flag values and stores it to a file.

  Token counts are read through `tokenizer.corpus_token_counts` when
  --corpus_filepattern is set, or through `tokenizer.vocab_token_counts` when
  --vocab_filepattern is set. The encoder is then built from those counts and
  written to --output_filename.

  Args:
    unused_argv: Not read by this function.

  Raises:
    ValueError: If --log_level is not one of "DEBUG", "INFO", "ERROR", or if
      both or neither of --corpus_filepattern and --vocab_filepattern are set.
  """
  verbosity = FLAGS.log_level
  if verbosity not in ('DEBUG', 'INFO', 'ERROR'):
    raise ValueError('Set verbosity among "DEBUG", "INFO", "ERROR"')
  tf.logging.set_verbosity(verbosity)

  corpus_pattern = FLAGS.corpus_filepattern
  vocab_pattern = FLAGS.vocab_filepattern

  # Exactly one input source must be selected; reject the two invalid
  # combinations up front so the code below only has to pick a reader.
  if corpus_pattern and vocab_pattern:
    raise ValueError(
        'Must only provide one of --corpus_filepattern or --vocab_filepattern')
  if not corpus_pattern and not vocab_pattern:
    raise ValueError(
        'Must provide one of --corpus_filepattern or --vocab_filepattern')

  max_lines = FLAGS.corpus_max_lines
  if corpus_pattern:
    token_counts = tokenizer.corpus_token_counts(
        corpus_pattern,
        max_lines,
        split_on_newlines=FLAGS.split_on_newlines,
        additional_chars=FLAGS.additional_chars)
  else:
    token_counts = tokenizer.vocab_token_counts(vocab_pattern, max_lines)

  encoder = text_encoder.SubwordTextEncoder()
  encoder.build_from_token_counts(
      token_counts,
      FLAGS.min_count,
      FLAGS.num_iterations,
      max_subtoken_length=FLAGS.max_subtoken_length,
      backward=FLAGS.backward)
  encoder.store_to_file(FLAGS.output_filename, add_single_quotes=False)
  # Disabled alternative output, kept for reference:
  # encoder.store_to_file_with_counts(FLAGS.output_filename + "_counts")
# Script entry point: only runs when the file is executed directly, and hands
# `main` to TensorFlow's app runner.
if __name__ == '__main__':
  tf.app.run(main=main)