Skip to content

Commit

Permalink
Fix prepdoc source not found (#235)
Browse files Browse the repository at this point in the history
## Purpose
<!-- Describe the intention of the changes being proposed. What problem
does it solve or functionality does it add? -->
* ...

## Does this introduce a breaking change?
<!-- Mark one with an "x". -->
```
[ ] Yes
[ ] No
```

## Pull Request Type
What kind of change does this Pull Request introduce?

<!-- Please check the one that applies to this PR using "x". -->
```
[ ] Bugfix
[ ] Feature
[ ] Code style update (formatting, local variables)
[ ] Refactoring (no functional changes, no api changes)
[ ] Documentation content changes
[ ] Other... Please describe:
```

## How to Test
*  Get the code

```
git clone [repo-address]
cd [repo-name]
git checkout [branch-name]
npm install
```

* Test the code
<!-- Add steps to run the test suite and/or manually test -->
```
```

## What to Check
Verify that the following are valid
* ...

## Other Information
<!-- Add any other helpful information that may be needed here. -->
  • Loading branch information
LittleLittleCloud authored Nov 9, 2023
1 parent 1e2b031 commit a16ae58
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ public async Task<bool> EmbedBlobAsync(Stream blobStream, string blobName)
try
{
await EnsureSearchIndexAsync(searchIndexName);

Console.WriteLine($"Embedding blob '{blobName}'");
var pageMap = await GetDocumentTextAsync(blobStream, blobName);

var fileNameWithoutExtension = Path.GetFileNameWithoutExtension(blobName);
Expand Down Expand Up @@ -128,6 +128,7 @@ private async Task<IReadOnlyList<PageDetail>> GetDocumentTextAsync(Stream blobSt
logger?.LogInformation(
"Extracting text from '{Blob}' using Azure Form Recognizer", blobName);

Console.WriteLine($"Extracting text from '{blobName}' using Azure Form Recognizer");
using var ms = new MemoryStream();
blobStream.CopyTo(ms);
ms.Position = 0;
Expand Down Expand Up @@ -184,7 +185,7 @@ private async Task<IReadOnlyList<PageDetail>> GetDocumentTextAsync(Stream blobSt
pageMap.Add(new PageDetail(i, offset, pageText.ToString()));
offset += pageText.Length;
}

Console.WriteLine($"Extracted {pageMap.Count} pages from '{blobName}'");
return pageMap.AsReadOnly();
}

Expand Down Expand Up @@ -374,9 +375,7 @@ private static int FindPage(IReadOnlyList<PageDetail> pageMap, int offset)
return length - 1;
}

/// <summary>
/// Builds the blob name for one page of a source file.
/// </summary>
/// <param name="blobName">Path or name of the source file.</param>
/// <param name="page">Page number appended to the name of PDF files; defaults to 0.</param>
/// <returns>
/// <c>{file name without extension}-{page}.pdf</c> when the extension is ".pdf" in any casing;
/// otherwise the file name with any directory part removed.
/// </returns>
private static string BlobNameFromFilePage(string blobName, int page = 0) =>
    // Compare the extension ordinally and case-insensitively instead of lower-casing it:
    // the check is not linguistic, so it must not depend on the current culture,
    // and this avoids allocating a temporary lower-cased string.
    string.Equals(Path.GetExtension(blobName), ".pdf", StringComparison.OrdinalIgnoreCase)
        ? $"{Path.GetFileNameWithoutExtension(blobName)}-{page}.pdf"
        : Path.GetFileName(blobName);
/// <summary>
/// Returns the blob name to use for a file page.
/// </summary>
/// <param name="blobName">Name of the blob; returned unchanged.</param>
/// <param name="page">Page number; not used by this implementation.</param>
/// <returns>The value of <paramref name="blobName"/>, unmodified.</returns>
private static string BlobNameFromFilePage(string blobName, int page = 0)
{
    return blobName;
}

private async Task IndexSectionsAsync(string searchIndexName, IEnumerable<Section> sections, string blobName)
{
Expand Down
16 changes: 9 additions & 7 deletions app/prepdocs/PrepareDocs/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,7 @@ static async Task ProcessSingleFileAsync(AppOptions options, string fileName, IE
return;
}

await UploadBlobsAsync(options, fileName);
using (var stream = File.OpenRead(fileName))
{
await embedService.EmbedBlobAsync(stream, fileName);
}
await UploadBlobsAndCreateIndexAsync(options, fileName, embedService);
}
}
});
Expand Down Expand Up @@ -159,8 +155,8 @@ Removing sections from '{fileName ?? "all"}' from search index '{options.SearchI
}
}

static async ValueTask UploadBlobsAsync(
AppOptions options, string fileName)
static async ValueTask UploadBlobsAndCreateIndexAsync(
AppOptions options, string fileName, IEmbedService embeddingService)
{
var container = await GetBlobContainerClientAsync(options);

Expand Down Expand Up @@ -190,6 +186,11 @@ static async ValueTask UploadBlobsAsync(
{
ContentType = "application/pdf"
});

// revert stream position
stream.Position = 0;

await embeddingService.EmbedBlobAsync(stream, documentName);
}
finally
{
Expand All @@ -201,6 +202,7 @@ static async ValueTask UploadBlobsAsync(
{
    // Non-PDF path: upload the file as a single blob, then embed its content.
    var blobName = BlobNameFromFilePage(fileName);
    await UploadBlobAsync(fileName, blobName, container);

    // Hold the file stream in a using declaration so the handle is released as soon
    // as embedding completes; passing File.OpenRead(...) inline never disposed it.
    using var stream = File.OpenRead(fileName);
    await embeddingService.EmbedBlobAsync(stream, blobName);
}
}

Expand Down

0 comments on commit a16ae58

Please sign in to comment.