From 13eda4fc383954d6996fcd0da3543850d2406897 Mon Sep 17 00:00:00 2001
From: dantelmomsft
Date: Mon, 5 Aug 2024 12:01:09 +0200
Subject: [PATCH] add code documentation for indexer

---
 .../samples/indexer/DocumentProcessor.java    | 10 ++++
 .../AbstractTextEmbeddingsService.java        |  7 +++
 .../indexer/index/SearchIndexManager.java     | 29 ++++++++++--
 .../parser/DocumentIntelligencePDFParser.java | 46 ++++++++++++++++---
 .../indexer/parser/ItextPDFParser.java        |  5 ++
 .../samples/indexer/parser/TextSplitter.java  | 16 +++++--
 6 files changed, 99 insertions(+), 14 deletions(-)

diff --git a/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/DocumentProcessor.java b/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/DocumentProcessor.java
index 384ea4a..7fa2be5 100644
--- a/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/DocumentProcessor.java
+++ b/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/DocumentProcessor.java
@@ -14,6 +14,15 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+/**
+ * The DocumentProcessor class is responsible for processing and indexing documents.
+ * It takes a document as input, either as a file or as a byte array, and processes it for indexing.
+ * The processing involves:
+ * 1. parsing the document into pages
+ * 2. splitting the pages into sections
+ * 3. indexing these sections in Azure AI Search, adding embeddings so that semantic similarity search can be used.
+ * The class uses a SearchIndexManager to manage the indexing, a PDFParser to parse the document into pages, and a TextSplitter to split the pages into sections.
+ */
 public class DocumentProcessor {
 
     private static final Logger logger = LoggerFactory.getLogger(DocumentProcessor.class);
@@ -38,6 +47,7 @@ public void indexDocumentfromFile(String filepath, String category) throws IOExc
     public void indexDocumentFromBytes(String filename, String category, byte[] content){
         logger.debug("Indexing file {}", filename);
 
+        //TODO add support for other file types (docx, pptx, txt, md, html, etc.)
         List<Page> pages = pdfParser.parse(content);
         logger.info("Found {} pages in file {}", pages.size(), filename);
 
diff --git a/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/embeddings/AbstractTextEmbeddingsService.java b/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/embeddings/AbstractTextEmbeddingsService.java
index 61d43dd..cca46d7 100644
--- a/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/embeddings/AbstractTextEmbeddingsService.java
+++ b/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/embeddings/AbstractTextEmbeddingsService.java
@@ -20,6 +20,13 @@
 import com.knuddels.jtokkit.api.ModelType;
 
 import reactor.util.retry.Retry;
+
+/**
+ * This class provides a base implementation for creating text embeddings, which are stored in vector databases during the batch indexing process.
+ * For performance reasons it splits the input text into batches and creates the embeddings batch by batch.
+ * It also exposes fields for configuring the batch size, the token limit, and other options.
+ * The class uses the OpenAI client to create the embeddings and handles retries in case of HTTP response exceptions.
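+ *
+ * <p>A minimal usage sketch (the method name {@code createEmbeddingBatch} and its return shape are assumptions for
+ * illustration; this diff does not show the TextEmbeddingsService interface):</p>
+ * <pre>{@code
+ * // hypothetical: create one embedding vector per input text, batching internally for throughput
+ * List<List<Float>> embeddings = embeddingsService.createEmbeddingBatch(texts);
+ * }</pre>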
+ */
 public abstract class AbstractTextEmbeddingsService implements TextEmbeddingsService{
     protected String openAiDeploymentName;
     protected boolean disableBatch;
diff --git a/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/index/SearchIndexManager.java b/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/index/SearchIndexManager.java
index 409dffe..df5d178 100644
--- a/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/index/SearchIndexManager.java
+++ b/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/index/SearchIndexManager.java
@@ -31,6 +31,11 @@
 
 
 
+/**
+ * The SearchIndexManager class is responsible for managing the Azure AI Search index.
+ * It provides functionality to create the index, update the content of the index,
+ * and manage the embeddings of the sections.
+ */
 public class SearchIndexManager {
     private AzureSearchClientFactory azureSearchClientFactory;
     private String searchAnalyzerName;
@@ -44,6 +49,12 @@ public SearchIndexManager(AzureSearchClientFactory azureSearchClientFactory, Str
         this.embeddingsService = embeddingsService;
     }
 
+    /**
+     * Creates a new index with specific fields and configurations, and sets up the semantic search and vector search
+     * configurations for the index.
+     * It is generally not used at runtime, but only during environment setup.
+     * However, it is idempotent: it checks whether the index already exists and creates it only if it does not.
+     */
     public void createIndex() {
         if (azureSearchClientFactory.isVerbose()) {
             logger.debug("Ensuring search index {} exists", azureSearchClientFactory.getIndexName());
@@ -128,6 +139,12 @@ public void createIndex() {
         logger.info("Created index {}", azureSearchClientFactory.getIndexName());
     }
 
+    /**
+     * Updates the content of the index. It divides the sections into batches; for each batch it creates a list of documents,
+     * where each document is a map containing the section details.
+     * It also creates embeddings for each section and adds them to the corresponding document. Finally, it uploads the documents to the search client.
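+     *
+     * <p>A minimal usage sketch (assumes a fully constructed manager; the {@code sections} list normally comes from
+     * TextSplitter via DocumentProcessor):</p>
+     * <pre>{@code
+     * searchIndexManager.createIndex();           // idempotent, see createIndex()
+     * searchIndexManager.updateContent(sections); // uploads the documents together with their embeddings
+     * }</pre>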
+     *
+     * @param sections the list of sections to index
+     */
     public void updateContent(List<Section> sections) {
         int MAX_BATCH_SIZE = 1000;
         List<List<Section>> sectionBatches = new ArrayList<>();
@@ -161,7 +178,7 @@ public void updateContent(List<Section> sections) {
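+            // Assumption documented for clarity: the embeddings service returns one vector per input text,
+            // in input order, so the i-th embedding belongs to the i-th document of the batch.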
             documents.get(i).put("embedding", embeddings.get(i));
         }
-
+        //Finally upload the documents to the index, including the embeddings vector as well
         searchClient.uploadDocuments(documents);
     }
 
@@ -200,8 +217,14 @@ public void removeContent(String path) {
     }
     */
-
-
+
+
+    /**
+     * Derives the source page reference for a given file page.
+     * @param filename the name of the source file
+     * @param page the zero-based page number within the file
+     * @return if the file is a PDF, the filename with the page number appended; otherwise, it just returns the filename.
+     */
     private String getSourcePageFromFilePage(String filename, int page) {
         if (filename.toLowerCase().endsWith(".pdf")) {
             return filename + "#page=" + (page + 1);
diff --git a/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/parser/DocumentIntelligencePDFParser.java b/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/parser/DocumentIntelligencePDFParser.java
index 6037157..4f92d2e 100644
--- a/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/parser/DocumentIntelligencePDFParser.java
+++ b/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/parser/DocumentIntelligencePDFParser.java
@@ -32,6 +32,15 @@
 import org.slf4j.LoggerFactory;
 
 
+/**
+ * This is an implementation of a PDF parser using Azure's Document Intelligence service.
+ * It is designed to extract text and table data from PDF files and convert them into a structured format.
+ *
+ * It initializes an instance of DocumentAnalysisClient from Azure's Document Intelligence service in the constructor.
+ * It provides two parse methods, one accepting a File object and another accepting a byte array. Both convert the input into BinaryData and pass it to a private parse method.
+ * The private parse method sends the BinaryData to Azure's Document Intelligence service for analysis. It then processes the analysis result, extracting text and table data from each page of the PDF. Tables are converted into HTML format.
+ * The tableToHtml method converts a DocumentTable object into an HTML table. It handles row and column spans and escapes any HTML characters in the cell content.
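+ *
+ * <p>A minimal usage sketch (the service name and the file path are placeholders; any TokenCredential works,
+ * DefaultAzureCredential is used here only as an example):</p>
+ * <pre>{@code
+ * PDFParser parser = new DocumentIntelligencePDFParser("my-doc-intelligence-service",
+ *         new DefaultAzureCredentialBuilder().build(), false);
+ * List<Page> pages = parser.parse(Files.readAllBytes(Path.of("document.pdf")));
+ * }</pre>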
+ */
 public class DocumentIntelligencePDFParser implements PDFParser {
     private static final Logger logger = LoggerFactory.getLogger(DocumentIntelligencePDFParser.class);
 
@@ -39,6 +48,7 @@ public class DocumentIntelligencePDFParser implements PDFParser {
     private boolean verbose = false;
     private String modelId = "prebuilt-layout";
 
+
     public DocumentIntelligencePDFParser(String serviceName, TokenCredential tokenCredential, Boolean verbose) {
         this.client = new DocumentAnalysisClientBuilder()
                 .endpoint("https://%s.cognitiveservices.azure.com/".formatted(serviceName))
@@ -66,18 +76,26 @@ public List<Page> parse(byte[] content) {
     }
 
     private List<Page> parse(BinaryData fileData) {
+        // Create a list to store the pages of the PDF
         List<Page> pages = new ArrayList<>();
+
+        // Begin the document analysis process using Azure's Document Intelligence service
         SyncPoller<OperationResult, AnalyzeResult> analyzeLayoutResultPoller =
-            client.beginAnalyzeDocument(this.modelId, fileData);
+                client.beginAnalyzeDocument(this.modelId, fileData);
 
+        // Get the final result of the document analysis
         AnalyzeResult analyzeLayoutResult = analyzeLayoutResultPoller.getFinalResult();
 
         int offset = 0;
+        // Loop through each page in the analyzed document
         for (int page_num = 0; page_num < analyzeLayoutResult.getPages().size(); page_num++) {
             DocumentPage page = analyzeLayoutResult.getPages().get(page_num);
+
+            // Create a list to store the tables on the current page
             List<DocumentTable> tables_on_page = new ArrayList<>();
-            if(analyzeLayoutResult.getTables() != null){
+            // If there are tables in the analyzed document, add the tables on the current page to the list
+            if (analyzeLayoutResult.getTables() != null) {
                 for (DocumentTable table : analyzeLayoutResult.getTables()) {
                     BoundingRegion boundingRegion = table.getBoundingRegions().get(0);
                     if (boundingRegion.getPageNumber() == page_num + 1) {
@@ -85,19 +103,24 @@ private List<Page> parse(BinaryData fileData) {
                 }
             }
-
+
             DocumentSpan pageSpan = page.getSpans().get(0);
             int pageOffset = pageSpan.getOffset();
             int pageLength = pageSpan.getLength();
+
+            // Create an array to store, for each character on the page, the ID of the table it belongs to
             int[] tableChars = new int[pageLength];
             Arrays.fill(tableChars, -1);
+            // Loop through each table on the current page
             for (int tableId = 0; tableId < tables_on_page.size(); tableId++) {
                 DocumentTable table = tables_on_page.get(tableId);
-
+
+                // Loop through each span in the current table and mark the characters in the table
                 for (DocumentSpan span : table.getSpans()) {
                     for (int i = 0; i < span.getLength(); i++) {
                         int idx = span.getOffset() - pageOffset + i;
+                        // If the character is in the current table, store the table ID in the array
                         if (idx >= 0 && idx < pageLength) {
                             tableChars[idx] = tableId;
                         }
@@ -105,25 +128,34 @@ private List<Page> parse(BinaryData fileData) {
                 }
             }
 
+            // Create a StringBuilder to store the text of the current page
             StringBuilder pageText = new StringBuilder();
+
+            // Create a set to store the IDs of the tables that have been added to the page text
             Set<Integer> addedTables = new HashSet<>();
+
+            // Loop through each character in the array
             for (int idx = 0; idx < tableChars.length; idx++) {
                 int tableId = tableChars[idx];
                 if (tableId == -1) {
+                    // If the character is not in a table, add it to the page text
                     pageText.append(analyzeLayoutResult.getContent().substring(pageOffset + idx, pageOffset + idx + 1));
                 } else if (!addedTables.contains(tableId)) {
+                    // If the character is in a table and the table has not been added to the page text yet, add the whole table as HTML
                     DocumentTable table = tables_on_page.get(tableId);
                     pageText.append(tableToHtml(table));
                     addedTables.add(tableId);
                 }
             }
-            pages.add( new Page(page_num, offset, pageText.toString()));
+            // Add the current page to the list of pages
+            pages.add(new Page(page_num, offset, pageText.toString()));
+
             offset += pageText.length();
-        }
+        }
 
         return pages;
-    }
+    }
 
     private String tableToHtml(DocumentTable table) {
diff --git a/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/parser/ItextPDFParser.java b/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/parser/ItextPDFParser.java
index 728308f..ae87922 100644
--- a/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/parser/ItextPDFParser.java
+++ b/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/parser/ItextPDFParser.java
@@ -8,6 +8,11 @@
 import java.util.List;
 import java.util.ArrayList;
 
+/**
+ * This is an implementation of a PDF parser based on the open-source iText library.
+ * It can only handle text contained in the PDF.
+ * It cannot extract data from tables or images; see {@link DocumentIntelligencePDFParser} for that.
+ */
 public class ItextPDFParser implements PDFParser {
     @Override
     public List<Page> parse(File file) {
diff --git a/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/parser/TextSplitter.java b/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/parser/TextSplitter.java
index d6664e2..faf2051 100644
--- a/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/parser/TextSplitter.java
+++ b/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/parser/TextSplitter.java
@@ -6,6 +6,11 @@
 
 import com.microsoft.openai.samples.indexer.SplitPage;
 
+/**
+ * The TextSplitter class is responsible for splitting the text content of a list of pages into smaller sections.
+ * It does this by identifying sentence endings and word breaks, and then using these to determine where to split the text.
+ * The class also has a maximum section length, a sentence search limit, and a section overlap, which are used to fine-tune the splitting process.
+ */
 public class TextSplitter {
     private List<String> sentenceEndings;
     private List<String> wordBreaks;
@@ -15,6 +20,10 @@ public class TextSplitter {
     private boolean verbose;
 
     public TextSplitter(boolean verbose) {
+        this(verbose, 1000, 100, 100);
+    }
+
+    public TextSplitter(boolean verbose, int maxSectionLength, int sentenceSearchLimit, int sectionOverlap) {
         this.sentenceEndings = new ArrayList<>();
         this.sentenceEndings.add(".");
         this.sentenceEndings.add("。");
@@ -41,12 +50,11 @@ public TextSplitter(boolean verbose) {
         this.wordBreaks.add("\t");
         this.wordBreaks.add("\n");
 
-        this.maxSectionLength = 1000;
-        this.sentenceSearchLimit = 100;
-        this.sectionOverlap = 100;
+        this.maxSectionLength = maxSectionLength;
+        this.sentenceSearchLimit = sentenceSearchLimit;
+        this.sectionOverlap = sectionOverlap;
         this.verbose = verbose;
     }
-
     public List<SplitPage> splitPages(List<Page> pages) {
         List<SplitPage> splitPages = new ArrayList<>();
         StringBuilder allText = new StringBuilder();