optimize spm tokenizer for long text #799

Merged
merged 3 commits on Aug 30, 2024

Changes from all commits
74 changes: 53 additions & 21 deletions operators/tokenizer/bpe_kernels.cc
@@ -416,7 +416,53 @@ std::vector<int64_t> KernelBpeTokenizer::SpmTokenize(ustring& input,
    // Get byte encodings prior to performing BPE
    std::list<std::pair<uint32_t, uint32_t>> byte_list;

    while (res.size() < max_length && char_pos < ustr.length()) {
    while (res.size() < max_length && char_pos <= ustr.length()) {
      bool split_now = false;
      if (char_pos == ustr.length()) {
        split_now = true;
      }

      // Temporary split logic; will be replaced by a regex-based split once it is implemented.
      if (!split_now && byte_list.size() > 10) {
        auto is_split_char = [](char32_t ch) {
          return ch == U' ' || ch == U'\n' || ch == U'\r' || ch == U'▁';
        };
        if (!is_split_char(ustr[char_pos - 1]) && is_split_char(ustr[char_pos])) {
          split_now = true;
        }
        // Split immediately to avoid an overly long byte_list in extreme cases, which is slow.
        if (!split_now && byte_list.size() > 100) {
          split_now = true;
        }
      }

      if (split_now) {
        // Perform BPE
        bbpe_tokenizer_->PerformBPE(byte_list);

        // Add output to result
        for (auto p : byte_list) {
          if (res.size() >= max_length) {
            break;
          }

          res.push_back(p.first);

          if (compute_offset_mapping) {
            offset_mapping.emplace_back(std::make_pair(
                offset,
                ort_extensions::narrow<size_t>(offset + (size_t)p.second)));
            offset += ((size_t)p.second);
          }
        }

        byte_list.clear();
      }

      if (char_pos == ustr.length()) {
        break;
      }

      auto chr = ustr[char_pos];
      if (chr == U' ') {
        chr = 0x2581;  // UTF-8 string '\xe2\x96\x81'

@@ -436,33 +482,19 @@ std::vector<int64_t> KernelBpeTokenizer::SpmTokenize(ustring& input,

      char_pos++;
    }
    {
      // Perform BPE
      bbpe_tokenizer_->PerformBPE(byte_list);

      // Add output to result
      for (auto p : byte_list) {
        if (res.size() >= max_length) {
          break;
        }

        res.push_back(p.first);

        if (compute_offset_mapping) {
          offset_mapping.emplace_back(std::make_pair(
              offset,
              ort_extensions::narrow<size_t>(offset + (size_t)p.second)));
          offset += ((size_t)p.second);
        }
      }
    }

    if (compute_offset_mapping) {
      // Add offset mappings for input in this instance to list of offset mappings for all inputs
      offset_map.emplace_back(offset_mapping);
    }
  }

  if (res.size() > 0 && res.front() == bos_token_id_) {
    if (add_bos_token_.has_value() && add_bos_token_.value() == false) {
      res.erase(res.begin());
    }
  }

  return res;
}
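
Why this helps: running `PerformBPE` once over the pending symbol list for an entire long input is slow (the in-diff comment calls out that an overly long `byte_list` is the bottleneck). The change instead flushes `byte_list` whenever the scan reaches a whitespace-like boundary once the list holds more than 10 symbols, with a hard cap of 100 symbols for inputs that contain no such boundary. The standalone sketch below illustrates that flush-at-boundary pattern in isolation; `flush_chunk` merely stands in for the real `PerformBPE`, `max_length` and offset-mapping handling are dropped, and every name in it is illustrative rather than part of the onnxruntime-extensions API.

```cpp
// Hypothetical sketch of the chunked-flush strategy from the diff above.
// flush_chunk() is a stand-in for bbpe_tokenizer_->PerformBPE(); it is not
// the ort-extensions API and does no real BPE merging.
#include <cstdint>
#include <iostream>
#include <list>
#include <string>
#include <vector>

// Stand-in "BPE": emit one id per pending symbol (real code merges pairs first).
static void flush_chunk(std::list<char32_t>& pending, std::vector<int64_t>& out) {
  for (char32_t ch : pending) {
    out.push_back(static_cast<int64_t>(ch));
  }
  pending.clear();
}

std::vector<int64_t> tokenize_chunked(const std::u32string& text) {
  std::vector<int64_t> res;
  std::list<char32_t> pending;

  auto is_split_char = [](char32_t ch) {
    return ch == U' ' || ch == U'\n' || ch == U'\r' || ch == U'\u2581';
  };

  for (size_t pos = 0; pos <= text.size(); ++pos) {
    bool split_now = (pos == text.size());  // always flush at end of input

    // Prefer to flush at a word boundary once the chunk has grown a little...
    if (!split_now && pending.size() > 10 &&
        !is_split_char(text[pos - 1]) && is_split_char(text[pos])) {
      split_now = true;
    }
    // ...but never let a single chunk grow without bound on pathological inputs.
    if (!split_now && pending.size() > 100) {
      split_now = true;
    }

    if (split_now) {
      flush_chunk(pending, res);
    }
    if (pos == text.size()) {
      break;
    }
    pending.push_back(text[pos]);
  }
  return res;
}

int main() {
  std::u32string text = U"hello world, this is a long input";
  std::vector<int64_t> ids = tokenize_chunked(text);
  std::cout << "emitted " << ids.size() << " ids\n";
  return 0;
}
```

The two thresholds (10 and 100) mirror the constants in the diff; as the temporary-split comment notes, the boundary set is a stopgap until a regex-based split is implemented.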
