`WhitespaceTokenizer` is a basic tokenizer that splits UTF-8 strings on ICU-defined whitespace characters (space, tab, newline):

```python
tokenizer = tensorflow_text.WhitespaceTokenizer()
tokens = tokenizer.tokenize(['everything not saved will be lost.',
                             u'Sad☹'.encode('UTF-8')])
print(tokens.to_list())
[['everything', 'not', 'saved', 'will', 'be', 'lost.'], ['Sad\xe2\x98\xb9']]
```
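Because each input string produces a different number of tokens, `tokenize` returns a `tf.RaggedTensor`. A minimal sketch of inspecting that structure and padding it into a dense tensor; the empty-byte padding value is our choice here, not anything the library mandates:

```python
import tensorflow as tf
import tensorflow_text

tokenizer = tensorflow_text.WhitespaceTokenizer()
tokens = tokenizer.tokenize(['everything not saved will be lost.', 'Sad'])

# Rows have different lengths, so the result is ragged.
print(tokens.row_lengths())  # tf.Tensor([6 1], shape=(2,), dtype=int64)

# Pad to a rectangular tensor for models that need dense input.
dense = tokens.to_tensor(default_value=b'')
print(dense.shape)  # (2, 6)
```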
`UnicodeScriptTokenizer` splits UTF-8 strings on Unicode script boundaries, which in practice also separates punctuation (USCRIPT_COMMON) from language text:

```python
tokenizer = tensorflow_text.UnicodeScriptTokenizer()
tokens = tokenizer.tokenize(['everything not saved will be lost.',
                             u'Sad☹'.encode('UTF-8')])
print(tokens.to_list())
[['everything', 'not', 'saved', 'will', 'be', 'lost', '.'], ['Sad', '\xe2\x98\xb9']]
```

Note that, unlike the whitespace example above, the trailing period is now split from `lost` and the emoticon from `Sad`.
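These tokenizers also compose with `tf.data` input pipelines. A minimal sketch reusing the example sentences from above; the pipeline itself is illustrative, not part of the tokenizer API:

```python
import tensorflow as tf
import tensorflow_text

tokenizer = tensorflow_text.UnicodeScriptTokenizer()

# Tokenizing a scalar string yields a 1-D tensor of tokens, so the
# mapped dataset holds one variable-length token tensor per sentence.
ds = tf.data.Dataset.from_tensor_slices(
    ['everything not saved will be lost.', u'Sad☹'])
ds = ds.map(tokenizer.tokenize)

for toks in ds:
    print(toks.numpy())
```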
`WordpieceTokenizer` further splits already-tokenized words into subwords (wordpieces) using a supplied vocabulary lookup table:

```python
def _CreateTable(vocab, num_oov=1):
  init = tf.lookup.KeyValueTensorInitializer(
      vocab,
      tf.range(tf.size(vocab, out_type=tf.int64), dtype=tf.int64),
      key_dtype=tf.string,
      value_dtype=tf.int64)
  return tf.lookup.StaticVocabularyTable(
      init, num_oov, lookup_key_dtype=tf.string)

vocab_table = _CreateTable(["great", "they", "the", "##'", "##re", "##est"])
tokens = [["they're", "the", "greatest"]]

tokenizer = tensorflow_text.WordpieceTokenizer(
    vocab_table, token_out_type=tf.string)
result = tokenizer.tokenize(tokens)
print(result.to_list())
[[['they', "##'", '##re'], ['the'], ['great', '##est']]]
```
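`token_out_type=tf.int64` is the default, in which case the tokenizer emits vocabulary ids instead of subword strings. A short sketch reusing the `_CreateTable` helper defined above:

```python
import tensorflow as tf
import tensorflow_text

vocab_table = _CreateTable(["great", "they", "the", "##'", "##re", "##est"])

# Ids are the row indices of each wordpiece in the vocabulary above,
# e.g. 'they' -> 1, "##'" -> 3, '##re' -> 4.
id_tokenizer = tensorflow_text.WordpieceTokenizer(
    vocab_table, token_out_type=tf.int64)
ids = id_tokenizer.tokenize([["they're", "the", "greatest"]])
print(ids.to_list())  # [[[1, 3, 4], [2], [0, 5]]]
```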
When mapping tokens back to the original text (e.g. for highlighting), byte offsets are useful. `tokenize_with_offsets` returns the tokens along with the byte where each token starts (`offset_starts`) and the byte where it ends (`offset_limits`):

```python
tokenizer = tensorflow_text.UnicodeScriptTokenizer()
(tokens, offset_starts, offset_limits) = tokenizer.tokenize_with_offsets(
    ['everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])
print(tokens.to_list())
print(offset_starts.to_list())
print(offset_limits.to_list())
[['everything', 'not', 'saved', 'will', 'be', 'lost', '.'], ['Sad', '\xe2\x98\xb9']]
[[0, 11, 15, 21, 26, 29, 33], [0, 3]]
[[10, 14, 20, 25, 28, 33, 34], [3, 6]]
```
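A minimal sketch of slicing tokens back out of the source string with these offsets; note they count bytes, so the slicing is done on the UTF-8 encoding of the input:

```python
import tensorflow_text

tokenizer = tensorflow_text.UnicodeScriptTokenizer()
sentence = 'everything not saved will be lost.'
tokens, starts, limits = tokenizer.tokenize_with_offsets([sentence])

# Offsets are byte positions, so index into the encoded bytes.
raw = sentence.encode('UTF-8')
for start, limit in zip(starts.to_list()[0], limits.to_list()[0]):
    print(raw[start:limit])  # b'everything', b'not', ..., b'.'
```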