From e3159edfc2655ac823b7371cefc0070cb8ae91c5 Mon Sep 17 00:00:00 2001 From: sinedied Date: Tue, 12 Dec 2023 09:26:43 +0100 Subject: [PATCH] refactor(indexer): extract handlers --- packages/indexer/src/lib/document-processor.ts | 9 ++------- packages/indexer/src/lib/indexer.ts | 4 ++++ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/packages/indexer/src/lib/document-processor.ts b/packages/indexer/src/lib/document-processor.ts index f5ee7a25..9f1aae89 100644 --- a/packages/indexer/src/lib/document-processor.ts +++ b/packages/indexer/src/lib/document-processor.ts @@ -1,7 +1,6 @@ import { type BaseLogger } from 'pino'; import { getBlobNameFromFile } from './blob-storage.js'; import { type ContentPage, type ContentSection, type Section } from './document.js'; -import { extractText, extractTextFromPdf } from './formats/index.js'; const SENTENCE_ENDINGS = new Set(['.', '!', '?']); const WORD_BREAKS = new Set([',', ';', ':', ' ', '(', ')', '[', ']', '{', '}', '\t', '\n']); @@ -12,11 +11,7 @@ const SECTION_OVERLAP = 100; export class DocumentProcessor { formatHandlers = new Map Promise>(); - constructor(private logger: BaseLogger) { - this.registerFormatHandler('text/plain', extractText); - this.registerFormatHandler('text/markdown', extractText); - this.registerFormatHandler('application/pdf', extractTextFromPdf); - } + constructor(private logger: BaseLogger) {} async createDocumentFromFile(filename: string, data: Buffer, type: string, category: string) { const pages = await this.extractText(data, type); @@ -25,7 +20,7 @@ export class DocumentProcessor { return { filename, type, category, sections }; } - private registerFormatHandler(type: string, handler: (data: Buffer) => Promise) { + public registerFormatHandler(type: string, handler: (data: Buffer) => Promise) { this.formatHandlers.set(type, handler); } diff --git a/packages/indexer/src/lib/indexer.ts b/packages/indexer/src/lib/indexer.ts index af9f4354..41e9779a 100644 --- a/packages/indexer/src/lib/indexer.ts +++ b/packages/indexer/src/lib/indexer.ts @@ -5,6 +5,7 @@ import { type AzureClients } from '../plugins/azure.js'; import { type OpenAiService } from '../plugins/openai.js'; import { wait } from './util/index.js'; import { DocumentProcessor } from './document-processor.js'; +import { extractText, extractTextFromPdf } from './formats/index.js'; import { MODELS_SUPPORTED_BATCH_SIZE } from './model-limits.js'; import { BlobStorage } from './blob-storage.js'; import { type Section } from './document.js'; @@ -137,6 +138,9 @@ export class Indexer { } const documentProcessor = new DocumentProcessor(this.logger); + documentProcessor.registerFormatHandler('text/plain', extractText); + documentProcessor.registerFormatHandler('text/markdown', extractText); + documentProcessor.registerFormatHandler('application/pdf', extractTextFromPdf); const document = await documentProcessor.createDocumentFromFile(filename, data, type, category); const sections = document.sections; if (options.useVectors) {