add code documentation for indexer
dantelmomsft committed Aug 5, 2024
1 parent b23bff9 commit 13eda4f
Showing 6 changed files with 99 additions and 14 deletions.
@@ -14,6 +14,15 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* The DocumentProcessor class is responsible for processing and indexing documents.
* It takes a document as input, either as a file or as a byte array, and processes it for indexing.
* The processing involves:
* 1. parsing the document into pages
* 2. splitting the pages into sections
* 3. indexing these sections in Azure AI Search, adding embeddings so that semantic similarity search can be used.
* The class uses a SearchIndexManager to manage the indexing, a PDFParser to parse the document into pages, and a TextSplitter to split the pages into sections.
*/
public class DocumentProcessor {

private static final Logger logger = LoggerFactory.getLogger(DocumentProcessor.class);
@@ -38,6 +47,7 @@ public void indexDocumentfromFile(String filepath, String category) throws IOExc

public void indexDocumentFromBytes(String filename, String category, byte[] content){
logger.debug("Indexing file {}", filename);
//TODO add support for other file types (docx, pptx, txt, md, html, etc)
List<Page> pages = pdfParser.parse(content);
logger.info("Found {} pages in file {}", pages.size(), filename);

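To illustrate the indexing pipeline described in the DocumentProcessor Javadoc above, a minimal usage sketch could look like the following. The DocumentProcessor constructor arguments are assumed from the collaborators the Javadoc lists (SearchIndexManager, PDFParser, TextSplitter) and are not part of this diff:

import java.nio.file.Files;
import java.nio.file.Path;

// Hypothetical wiring of the pipeline: parse -> split -> index.
// searchIndexManager, pdfParser and textSplitter are assumed to be already configured.
DocumentProcessor processor = new DocumentProcessor(searchIndexManager, pdfParser, textSplitter);

// Index a PDF from disk under a category.
processor.indexDocumentfromFile("/data/employee-handbook.pdf", "hr-docs");

// Or index a document already loaded in memory as bytes.
byte[] content = Files.readAllBytes(Path.of("/data/benefits.pdf"));
processor.indexDocumentFromBytes("benefits.pdf", "hr-docs", content);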
@@ -20,6 +20,13 @@
import com.knuddels.jtokkit.api.ModelType;
import reactor.util.retry.Retry;


/**
* This class provides a base implementation for creating the text embeddings that are stored in vector databases during the batch indexing process.
* For performance reasons it splits the text into batches and creates the embeddings batch by batch.
* It also includes fields for configuring the batch size, the token limit, and other options.
* The class uses the OpenAI client to create the embeddings and handles retries in case of HTTP response exceptions.
*/
public abstract class AbstractTextEmbeddingsService implements TextEmbeddingsService{
protected String openAiDeploymentName;
protected boolean disableBatch;
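The batch splitting mentioned above can be pictured with a small token-counting helper built on jtokkit (already imported in this file). The token limit and embedding model type used here are illustrative assumptions, not values taken from this class:

import com.knuddels.jtokkit.Encodings;
import com.knuddels.jtokkit.api.Encoding;
import com.knuddels.jtokkit.api.ModelType;
import java.util.ArrayList;
import java.util.List;

// Group texts into batches whose total token count stays under a limit,
// so that each embeddings request stays within the model's constraints.
static List<List<String>> splitIntoBatches(List<String> texts, int maxTokensPerBatch) {
    Encoding encoding = Encodings.newDefaultEncodingRegistry()
            .getEncodingForModel(ModelType.TEXT_EMBEDDING_ADA_002);
    List<List<String>> batches = new ArrayList<>();
    List<String> currentBatch = new ArrayList<>();
    int currentTokens = 0;
    for (String text : texts) {
        int tokens = encoding.countTokens(text);
        if (!currentBatch.isEmpty() && currentTokens + tokens > maxTokensPerBatch) {
            batches.add(currentBatch);
            currentBatch = new ArrayList<>();
            currentTokens = 0;
        }
        currentBatch.add(text);
        currentTokens += tokens;
    }
    if (!currentBatch.isEmpty()) {
        batches.add(currentBatch);
    }
    return batches;
}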
@@ -31,6 +31,11 @@



/**
* The SearchIndexManager class is responsible for managing the Azure AI Search index.
* It provides functionality to create the index, update its content,
* and manage the embeddings of the sections.
*/
public class SearchIndexManager {
private AzureSearchClientFactory azureSearchClientFactory;
private String searchAnalyzerName;
@@ -44,6 +49,12 @@ public SearchIndexManager(AzureSearchClientFactory azureSearchClientFactory, Str
this.embeddingsService = embeddingsService;
}

/**
* Creates a new index with specific fields and configurations. It also sets up the semantic search and vector search
* configurations for the index.
* This is generally not used at runtime, but only during environment setup.
* However, it is idempotent: it checks whether the index already exists and creates it only if it does not.
*/
public void createIndex() {
if (azureSearchClientFactory.isVerbose()) {
logger.debug("Ensuring search index {} exists", azureSearchClientFactory.getIndexName());
@@ -128,6 +139,12 @@ public void createIndex() {
logger.info("Created index {}", azureSearchClientFactory.getIndexName());
}
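The "check first, create only if missing" behaviour described above is commonly implemented along these lines; this is a sketch under the assumption that a SearchIndexClient and a prepared SearchIndex definition are available, not the code from this commit:

import com.azure.core.exception.HttpResponseException;
import com.azure.search.documents.indexes.SearchIndexClient;
import com.azure.search.documents.indexes.models.SearchIndex;

// Idempotent index creation: look the index up first and only create it when the lookup fails with 404.
void ensureIndexExists(SearchIndexClient indexClient, SearchIndex indexDefinition) {
    try {
        indexClient.getIndex(indexDefinition.getName());
    } catch (HttpResponseException e) {
        if (e.getResponse() != null && e.getResponse().getStatusCode() == 404) {
            indexClient.createIndex(indexDefinition);
        } else {
            throw e;
        }
    }
}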

/**
* Updates the content of the index. It divides the sections into batches and, for each batch, creates a list of documents. Each document
* is a map containing the section details.
* It also creates embeddings for each section and adds them to the corresponding document. Finally, it uploads the documents to the search client.
* @param sections the sections to add to the index
*/
public void updateContent(List<Section> sections) {
int MAX_BATCH_SIZE = 1000;
List<List<Section>> sectionBatches = new ArrayList<>();
@@ -161,7 +178,7 @@ public void updateContent(List<Section> sections) {
documents.get(i).put("embedding", embeddings.get(i));
}


// Finally, upload the documents to the index, including the embeddings vector
searchClient.uploadDocuments(documents);
}
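The batching that updateContent performs can be sketched as follows; this is an illustration of the idea described in the Javadoc above, not necessarily the exact loop used in the method:

// Partition the sections into slices of at most MAX_BATCH_SIZE elements,
// so that each upload (documents plus embeddings) stays within service limits.
int MAX_BATCH_SIZE = 1000;
List<List<Section>> sectionBatches = new ArrayList<>();
for (int i = 0; i < sections.size(); i += MAX_BATCH_SIZE) {
    sectionBatches.add(sections.subList(i, Math.min(i + MAX_BATCH_SIZE, sections.size())));
}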

@@ -200,8 +217,14 @@ public void removeContent(String path) {
}
*/




/**
* Returns the source page reference for a given page of a file.
* @param filename the name of the source file
* @param page the zero-based page number
* @return the source page. If the file is a PDF, it appends the page number to the filename. Otherwise, it just returns the filename.
*/
private String getSourcePageFromFilePage(String filename, int page) {
if (filename.toLowerCase().endsWith(".pdf")) {
return filename + "#page=" + (page + 1);
@@ -32,13 +32,23 @@
import org.slf4j.LoggerFactory;


/**
* This is an implementation of a PDF parser using Azure's Document Intelligence service.
* It is designed to extract text and table data from PDF files and convert them into a structured format.
*
* It initializes an instance of DocumentAnalysisClient from Azure's Document Intelligence service in the constructor.
* It provides two parse methods, one accepting a File object and another accepting a byte array. Both methods convert the input into BinaryData and pass it to a private parse method.
* The private parse method sends the BinaryData to Azure's Document Intelligence service for analysis. It then processes the analysis result, extracting text and table data from each page of the PDF. Tables are converted into HTML format.
* The tableToHtml method is used to convert a DocumentTable object into an HTML table. It handles row and column spans and escapes any HTML characters in the cell content.
*/
public class DocumentIntelligencePDFParser implements PDFParser {
private static final Logger logger = LoggerFactory.getLogger(DocumentIntelligencePDFParser.class);

private final DocumentAnalysisClient client;
private boolean verbose = false;
private String modelId = "prebuilt-layout";


public DocumentIntelligencePDFParser(String serviceName, TokenCredential tokenCredential, Boolean verbose) {
this.client = new DocumentAnalysisClientBuilder()
.endpoint("https://%s.cognitiveservices.azure.com/".formatted(serviceName))
@@ -66,64 +76,86 @@ public List<Page> parse(byte[] content) {
}

private List<Page> parse(BinaryData fileData) {
// Create a list to store the pages of the PDF
List<Page> pages = new ArrayList<>();

// Begin the document analysis process using Azure's Document Intelligence service
SyncPoller<OperationResult, AnalyzeResult> analyzeLayoutResultPoller =
client.beginAnalyzeDocument(this.modelId, fileData);

// Get the final result of the document analysis
AnalyzeResult analyzeLayoutResult = analyzeLayoutResultPoller.getFinalResult();

int offset = 0;
// Loop through each page in the analyzed document
for (int page_num = 0; page_num < analyzeLayoutResult.getPages().size(); page_num++) {
DocumentPage page = analyzeLayoutResult.getPages().get(page_num);

// Create a list to store the tables on the current page
List<DocumentTable> tables_on_page = new ArrayList<>();

// If there are tables in the analyzed document, add the tables on the current page to the list
if (analyzeLayoutResult.getTables() != null) {
for (DocumentTable table : analyzeLayoutResult.getTables()) {
BoundingRegion boundingRegion = table.getBoundingRegions().get(0);
if (boundingRegion.getPageNumber() == page_num + 1) {
tables_on_page.add(table);
}
}
}

DocumentSpan pageSpan = page.getSpans().get(0);
int pageOffset = pageSpan.getOffset();
int pageLength = pageSpan.getLength();

// Create an array to store the characters in the tables on the current page
int[] tableChars = new int[pageLength];
Arrays.fill(tableChars, -1);

// Loop through each table on the current page
for (int tableId = 0; tableId < tables_on_page.size(); tableId++) {
DocumentTable table = tables_on_page.get(tableId);


// Loop through each span in the current table and mark the characters in the table
for (DocumentSpan span : table.getSpans()) {
for (int i = 0; i < span.getLength(); i++) {
int idx = span.getOffset() - pageOffset + i;
// If the character is in the current table, store the table ID in the array
if (idx >= 0 && idx < pageLength) {
tableChars[idx] = tableId;
}
}
}
}

// Create a StringBuilder to store the text of the current page
StringBuilder pageText = new StringBuilder();

// Create a set to store the IDs of the tables that have been added to the page text
Set<Integer> addedTables = new HashSet<>();

// Loop through each character in the array
for (int idx = 0; idx < tableChars.length; idx++) {
int tableId = tableChars[idx];
if (tableId == -1) {
// If the character is not in a table, add it to the page text
pageText.append(analyzeLayoutResult.getContent().substring(pageOffset + idx, pageOffset + idx + 1));
} else if (!addedTables.contains(tableId)) {
// If the character is in a table and the table has not been added to the page text, add the table to the page text
DocumentTable table = tables_on_page.get(tableId);
pageText.append(tableToHtml(table));
addedTables.add(tableId);
}
}

// Add the current page to the list of pages
pages.add(new Page(page_num, offset, pageText.toString()));

offset += pageText.length();

}
return pages;
}


private String tableToHtml(DocumentTable table) {
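A minimal usage sketch for the parser shown above; the service name and the file path are placeholders, and the credential is just one possible way to authenticate:

import com.azure.core.credential.TokenCredential;
import com.azure.identity.DefaultAzureCredentialBuilder;
import java.io.File;
import java.util.List;

// Analyze a local PDF with the prebuilt-layout model and get back one Page per PDF page,
// with tables rendered as HTML inside the page text.
TokenCredential credential = new DefaultAzureCredentialBuilder().build();
DocumentIntelligencePDFParser parser =
        new DocumentIntelligencePDFParser("my-document-intelligence-service", credential, false);
List<Page> pages = parser.parse(new File("employee-handbook.pdf"));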
@@ -8,6 +8,11 @@
import java.util.List;
import java.util.ArrayList;

/**
* This is an implementation of a PDF parser using the open source iText library.
* It can only handle text within the PDF.
* It can't extract data from tables or images. See {@link DocumentIntelligencePDFParser} for that.
*/
public class ItextPDFParser implements PDFParser {
@Override
public List<Page> parse(File file) {
@@ -6,6 +6,11 @@
import com.microsoft.openai.samples.indexer.SplitPage;


/**
* This class is responsible for splitting the text content of a list of pages into smaller sections.
* It does this by identifying sentence endings and word breaks, and then using these to determine where to split the text.
* The class also has a maximum section length, a sentence search limit, and a section overlap, which are used to fine-tune the splitting process.
*/
public class TextSplitter {
private List<String> sentenceEndings;
private List<String> wordBreaks;
@@ -15,6 +20,10 @@ public class TextSplitter {
private boolean verbose;

public TextSplitter(boolean verbose) {
this(verbose, 1000, 100, 100);
}

public TextSplitter(boolean verbose, int maxSectionLength, int sentenceSearchLimit, int sectionOverlap) {
this.sentenceEndings = new ArrayList<>();
this.sentenceEndings.add(".");
this.sentenceEndings.add("。");
@@ -41,12 +50,11 @@ public TextSplitter(boolean verbose) {
this.wordBreaks.add("\t");
this.wordBreaks.add("\n");

this.maxSectionLength = 1000;
this.sentenceSearchLimit = 100;
this.sectionOverlap = 100;
this.maxSectionLength = maxSectionLength;
this.sentenceSearchLimit = sentenceSearchLimit;
this.sectionOverlap = sectionOverlap;
this.verbose = verbose;
}

public List<SplitPage> splitPages(List<Page> pages) {
List<SplitPage> splitPages = new ArrayList<>();
StringBuilder allText = new StringBuilder();
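A small usage sketch for TextSplitter; the values passed to the second constructor are examples only:

// Default limits: 1000-character sections, 100-character sentence search window, 100-character overlap.
TextSplitter defaultSplitter = new TextSplitter(false);

// Or tune the section size, sentence search limit and overlap explicitly.
TextSplitter tunedSplitter = new TextSplitter(true, 500, 50, 50);

List<SplitPage> sections = tunedSplitter.splitPages(pages);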
