Skip to content

Commit

Permalink
compact text splitter sentence and word breaks delimiters
Browse files (browse the repository at this point in the history)
  • Loading branch information
dantelmomsft committed Aug 29, 2024
1 parent 4f8fb6e commit d540b32
Showing 1 changed file with 3 additions and 25 deletions.
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
package com.microsoft.openai.samples.indexer.parser;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import com.microsoft.openai.samples.indexer.SplitPage;
Expand All @@ -24,31 +25,8 @@ public TextSplitter(boolean verbose) {
}

public TextSplitter(boolean verbose, int maxSectionLength, int sentenceSearchLimit, int sectionOverlap) {
this.sentenceEndings = new ArrayList<>();
this.sentenceEndings.add(".");
this.sentenceEndings.add("。");
this.sentenceEndings.add(".");
this.sentenceEndings.add("!");
this.sentenceEndings.add("?");
this.sentenceEndings.add("‼");
this.sentenceEndings.add("⁇");
this.sentenceEndings.add("⁈");
this.sentenceEndings.add("⁉");

this.wordBreaks = new ArrayList<>();
this.wordBreaks.add(",");
this.wordBreaks.add("、");
this.wordBreaks.add(";");
this.wordBreaks.add(":");
this.wordBreaks.add(" ");
this.wordBreaks.add("(");
this.wordBreaks.add(")");
this.wordBreaks.add("[");
this.wordBreaks.add("]");
this.wordBreaks.add("{");
this.wordBreaks.add("}");
this.wordBreaks.add("\t");
this.wordBreaks.add("\n");
this.sentenceEndings = Arrays.asList(".", "。", ".", "!", "?", "‼", "⁇", "⁈", "⁉");
this.wordBreaks = Arrays.asList(",", "、", ";", ":", " ", "(", ")", "[", "]", "{", "}", "\t", "\n");

this.maxSectionLength = maxSectionLength;
this.sentenceSearchLimit = sentenceSearchLimit;
Expand Down

0 comments on commit d540b32

Please sign in to comment.