diff --git a/app/functions/EmbedFunctions/Services/AzureSearchEmbedService.cs b/app/functions/EmbedFunctions/Services/AzureSearchEmbedService.cs index d9ae4169..1e7ff8da 100644 --- a/app/functions/EmbedFunctions/Services/AzureSearchEmbedService.cs +++ b/app/functions/EmbedFunctions/Services/AzureSearchEmbedService.cs @@ -24,7 +24,7 @@ public async Task EmbedBlobAsync(Stream blobStream, string blobName) try { await EnsureSearchIndexAsync(searchIndexName); - + Console.WriteLine($"Embedding blob '{blobName}'"); var pageMap = await GetDocumentTextAsync(blobStream, blobName); var fileNameWithoutExtension = Path.GetFileNameWithoutExtension(blobName); @@ -128,6 +128,7 @@ private async Task> GetDocumentTextAsync(Stream blobSt logger?.LogInformation( "Extracting text from '{Blob}' using Azure Form Recognizer", blobName); + Console.WriteLine($"Extracting text from '{blobName}' using Azure Form Recognizer"); using var ms = new MemoryStream(); blobStream.CopyTo(ms); ms.Position = 0; @@ -184,7 +185,7 @@ private async Task> GetDocumentTextAsync(Stream blobSt pageMap.Add(new PageDetail(i, offset, pageText.ToString())); offset += pageText.Length; } - + Console.WriteLine($"Extracted {pageMap.Count} pages from '{blobName}'"); return pageMap.AsReadOnly(); } @@ -374,9 +375,7 @@ private static int FindPage(IReadOnlyList pageMap, int offset) return length - 1; } - private static string BlobNameFromFilePage(string blobName, int page = 0) => Path.GetExtension(blobName).ToLower() is ".pdf" - ? $"{Path.GetFileNameWithoutExtension(blobName)}-{page}.pdf" - : Path.GetFileName(blobName); + private static string BlobNameFromFilePage(string blobName, int page = 0) => blobName; private async Task IndexSectionsAsync(string searchIndexName, IEnumerable
sections, string blobName) { diff --git a/app/prepdocs/PrepareDocs/Program.cs b/app/prepdocs/PrepareDocs/Program.cs index 794acd50..05b4ec0d 100644 --- a/app/prepdocs/PrepareDocs/Program.cs +++ b/app/prepdocs/PrepareDocs/Program.cs @@ -59,11 +59,7 @@ static async Task ProcessSingleFileAsync(AppOptions options, string fileName, IE return; } - await UploadBlobsAsync(options, fileName); - using (var stream = File.OpenRead(fileName)) - { - await embedService.EmbedBlobAsync(stream, fileName); - } + await UploadBlobsAndCreateIndexAsync(options, fileName, embedService); } } }); @@ -159,8 +155,8 @@ Removing sections from '{fileName ?? "all"}' from search index '{options.SearchI } } -static async ValueTask UploadBlobsAsync( - AppOptions options, string fileName) +static async ValueTask UploadBlobsAndCreateIndexAsync( + AppOptions options, string fileName, IEmbedService embeddingService) { var container = await GetBlobContainerClientAsync(options); @@ -190,6 +186,11 @@ static async ValueTask UploadBlobsAsync( { ContentType = "application/pdf" }); + + // revert stream position + stream.Position = 0; + + await embeddingService.EmbedBlobAsync(stream, documentName); } finally { @@ -201,6 +202,7 @@ static async ValueTask UploadBlobsAsync( { var blobName = BlobNameFromFilePage(fileName); await UploadBlobAsync(fileName, blobName, container); + await embeddingService.EmbedBlobAsync(File.OpenRead(fileName), blobName); } }