From c9f7502af3dfa0f29cc8260c1aacbdf0aab09d8d Mon Sep 17 00:00:00 2001 From: Umesh Madan Date: Thu, 30 Jan 2025 21:34:17 -0800 Subject: [PATCH] knowpro: Timestamp Scope (#643) Experiments: * Timestamp index * Scope queries using date ranges --- ts/examples/chat/src/memory/common.ts | 35 +- ts/examples/chat/src/memory/knowproMemory.ts | 322 +++++------------- ts/examples/chat/src/memory/knowproPrinter.ts | 201 +++++++++++ ts/examples/chat/src/memory/podcastMemory.ts | 3 +- ts/packages/knowPro/src/accumulators.ts | 25 +- ts/packages/knowPro/src/dataFormat.ts | 10 + ts/packages/knowPro/src/import.ts | 90 +++-- ts/packages/knowPro/src/query.ts | 95 +++--- ts/packages/knowPro/src/search.ts | 19 +- ts/packages/knowPro/src/timestampIndex.ts | 81 +++++ 10 files changed, 542 insertions(+), 339 deletions(-) create mode 100644 ts/examples/chat/src/memory/knowproPrinter.ts create mode 100644 ts/packages/knowPro/src/timestampIndex.ts diff --git a/ts/examples/chat/src/memory/common.ts b/ts/examples/chat/src/memory/common.ts index 703636bfb..1dc0395b3 100644 --- a/ts/examples/chat/src/memory/common.ts +++ b/ts/examples/chat/src/memory/common.ts @@ -172,14 +172,37 @@ export function argChunkSize(defaultValue?: number | undefined): ArgDef { }; } -export function argToDate(value: string | undefined): Date | undefined { - return value ? 
dateTime.stringToDate(value) : undefined; +export function recordFromArgs( + args: NamedArgs, + metadata?: CommandMetadata, +): Record { + const record: Record = {}; + const keys = Object.keys(args); + for (const key of keys) { + const value = args[key]; + if (typeof value !== "function") { + record[key] = value; + } + } + if (metadata !== undefined) { + if (metadata.args) { + removeKeysFromRecord(record, Object.keys(metadata.args)); + } + if (metadata.options) { + removeKeysFromRecord(record, Object.keys(metadata.options)); + } + } + return record; +} + +function removeKeysFromRecord(record: Record, keys: string[]) { + for (const key of keys) { + delete record[key]; + } } -export function addMinutesToDate(date: Date, minutes: number): Date { - const time = date.getTime(); - const offsetMs = minutes * 60 * 1000; - return new Date(time + offsetMs); +export function argToDate(value: string | undefined): Date | undefined { + return value ? dateTime.stringToDate(value) : undefined; } export function parseFreeAndNamedArguments( diff --git a/ts/examples/chat/src/memory/knowproMemory.ts b/ts/examples/chat/src/memory/knowproMemory.ts index 0b50bb0b7..f106356f8 100644 --- a/ts/examples/chat/src/memory/knowproMemory.ts +++ b/ts/examples/chat/src/memory/knowproMemory.ts @@ -16,16 +16,18 @@ import { import { ChatContext } from "./chatMemory.js"; import { ChatModel } from "aiclient"; import fs from "fs"; -import { ChatPrinter } from "../chatPrinter.js"; import { addFileNameSuffixToPath, argDestFile, argSourceFile, + argToDate, parseFreeAndNamedArguments, + recordFromArgs, } from "./common.js"; -import { ensureDir, readJsonFile, writeJsonFile } from "typeagent"; +import { dateTime, ensureDir, readJsonFile, writeJsonFile } from "typeagent"; import path from "path"; import chalk from "chalk"; +import { KnowProPrinter } from "./knowproPrinter.js"; type KnowProContext = { knowledgeModel: ChatModel; @@ -46,16 +48,39 @@ export async function createKnowproCommands( }; await 
ensureDir(context.basePath); + commands.kpPodcastMessages = showMessages; commands.kpPodcastImport = podcastImport; + commands.kpPodcastTimestamp = podcastTimestamp; commands.kpPodcastSave = podcastSave; commands.kpPodcastLoad = podcastLoad; commands.kpSearchTerms = searchTerms; - commands.kpSearchEntities = searchEntities; + commands.kpEntities = entities; commands.kpPodcastBuildIndex = podcastBuildIndex; /*---------------- * COMMANDS *---------------*/ + function showMessagesDef(): CommandMetadata { + return { + description: "Show all messages", + options: { + maxMessages: argNum("Maximum messages to display"), + }, + }; + } + commands.kpPodcastMessages.metadata = showMessagesDef(); + async function showMessages(args: string[]) { + const conversation = ensureConversationLoaded(); + if (!conversation) { + return; + } + const namedArgs = parseNamedArguments(args, showMessagesDef()); + const messages = + namedArgs.maxMessages > 0 + ? conversation.messages.slice(0, namedArgs.maxMessages) + : conversation.messages; + messages.forEach((m) => context.printer.writeMessage(m)); + } function podcastImportDef(): CommandMetadata { return { @@ -97,6 +122,29 @@ export async function createKnowproCommands( await podcastSave(namedArgs); } + function podcastTimestampDef(): CommandMetadata { + return { + description: "Set timestamps", + args: { + startAt: arg("Start date and time"), + }, + options: { + length: argNum("Length of the podcast in minutes", 60), + }, + }; + } + commands.kpPodcastTimestamp.metadata = podcastTimestampDef(); + async function podcastTimestamp(args: string[]) { + const conversation = ensureConversationLoaded(); + if (!conversation) { + return; + } + const namedArgs = parseNamedArguments(args, podcastTimestampDef()); + const startAt = argToDate(namedArgs.startAt)!; + const endAt = dateTime.addMinutesToDate(startAt, namedArgs.length); + kp.timestampMessages(conversation.messages, startAt, endAt); + } + function podcastSaveDef(): CommandMetadata { return { 
description: "Save Podcast", @@ -160,14 +208,22 @@ export async function createKnowproCommands( context.printer.writePodcastInfo(context.podcast); } - function searchTermsDef(): CommandMetadata { - return { - description: "Search current knowPro conversation by terms", + function searchTermsDef( + description?: string, + kType?: kp.KnowledgeType, + ): CommandMetadata { + const meta: CommandMetadata = { + description: + description ?? "Search current knowPro conversation by terms", options: { maxToDisplay: argNum("Maximum matches to display", 25), - ktype: arg("Knowledge type"), }, }; + if (kType === undefined) { + meta.options!.ktype = arg("Knowledge type"); + } + + return meta; } commands.kpSearchTerms.metadata = searchTermsDef(); async function searchTerms(args: string[]): Promise { @@ -178,11 +234,12 @@ export async function createKnowproCommands( if (!conversation) { return; } + const commandDef = searchTermsDef(); let [termArgs, namedArgs] = parseFreeAndNamedArguments( args, - searchTermsDef(), + commandDef, ); - const terms = parseQueryTerms(termArgs); // Todo: De dupe + const terms = parseQueryTerms(termArgs); if (conversation.semanticRefIndex && conversation.semanticRefs) { context.printer.writeInColor( chalk.cyan, @@ -192,7 +249,7 @@ export async function createKnowproCommands( const matches = await kp.searchConversation( conversation, terms, - filterFromArgs(namedArgs), + filterFromArgs(namedArgs, commandDef), ); if (matches === undefined || matches.size === 0) { context.printer.writeLine("No matches"); @@ -210,47 +267,37 @@ export async function createKnowproCommands( } } - function filterFromArgs(namedArgs: NamedArgs) { - let filter: kp.SearchFilter = { type: namedArgs.ktype }; - let argCopy = { ...namedArgs }; - delete argCopy.maxToDisplay; - delete argCopy.ktype; - let keys = Object.keys(argCopy); - if (keys.length > 0) { - for (const key of keys) { - const value = argCopy[key]; - if (typeof value === "function") { - delete argCopy[key]; - } - } - if 
(Object.keys(argCopy).length > 0) { - filter.propertiesToMatch = argCopy; - } - } + function filterFromArgs(namedArgs: NamedArgs, metadata: CommandMetadata) { + let filter: kp.SearchFilter = { + type: namedArgs.ktype, + propertiesToMatch: recordFromArgs(namedArgs, metadata), + }; return filter; } function entitiesDef(): CommandMetadata { - return { - description: "Display entities in current conversation", - }; + return searchTermsDef( + "Search entities in current conversation", + "entity", + ); } - commands.kpSearchEntities.metadata = entitiesDef(); - async function searchEntities(args: string[]): Promise { + commands.kpEntities.metadata = entitiesDef(); + async function entities(args: string[]): Promise { const conversation = ensureConversationLoaded(); if (!conversation) { return; } if (args.length > 0) { + args.push("--ktype"); + args.push("entity"); + await searchTerms(args); } else { - // - // Display all entities - // - const matches = filterSemanticRefsByType( - conversation.semanticRefs, - "entity", - ); - context.printer.writeSemanticRefs(matches); + if (conversation.semanticRefs !== undefined) { + const entities = conversation.semanticRefs?.filter( + (sr) => sr.knowledgeType === "entity", + ); + context.printer.writeSemanticRefs(entities); + } } } @@ -349,201 +396,6 @@ export async function createKnowproCommands( } } -class KnowProPrinter extends ChatPrinter { - constructor() { - super(); - } - - public writeEntity( - entity: knowLib.conversation.ConcreteEntity | undefined, - ) { - if (entity !== undefined) { - this.writeLine(entity.name.toUpperCase()); - this.writeList(entity.type, { type: "csv" }); - if (entity.facets) { - const facetList = entity.facets.map((f) => - knowLib.conversation.facetToString(f), - ); - this.writeList(facetList, { type: "ul" }); - } - } - return this; - } - - public writeAction(action: knowLib.conversation.Action | undefined) { - if (action !== undefined) { - this.writeLine(knowLib.conversation.actionToString(action)); - } - 
} - - public writeTopic(topic: kp.ITopic | undefined) { - if (topic !== undefined) { - this.writeLine(topic.text); - } - } - - public writeSemanticRef(semanticRef: kp.SemanticRef) { - switch (semanticRef.knowledgeType) { - default: - this.writeLine(semanticRef.knowledgeType); - break; - case "entity": - this.writeEntity( - semanticRef.knowledge as knowLib.conversation.ConcreteEntity, - ); - break; - case "action": - this.writeAction( - semanticRef.knowledge as knowLib.conversation.Action, - ); - break; - case "topic": - this.writeTopic(semanticRef.knowledge as kp.ITopic); - break; - } - return this; - } - - public writeSemanticRefs(refs: kp.SemanticRef[] | undefined) { - if (refs && refs.length > 0) { - for (const ref of refs) { - this.writeSemanticRef(ref); - this.writeLine(); - } - } - return this; - } - - public writeScoredSemanticRefs( - semanticRefMatches: kp.ScoredSemanticRef[], - semanticRefs: kp.SemanticRef[], - maxToDisplay: number, - ) { - this.writeLine( - `Displaying ${maxToDisplay} matches of total ${semanticRefMatches.length}`, - ); - for (let i = 0; i < maxToDisplay; ++i) { - const match = semanticRefMatches[i]; - const semanticRef = semanticRefs[match.semanticRefIndex]; - - this.writeInColor( - chalk.green, - `#${i + 1}: ${semanticRef.knowledgeType} [${match.score}]`, - ); - this.writeSemanticRef(semanticRef); - this.writeLine(); - } - } - - public writeSearchResult( - conversation: kp.IConversation, - result: kp.SearchResult | undefined, - maxToDisplay: number, - ) { - if (result) { - this.writeListInColor(chalk.cyanBright, result.termMatches, { - title: "Matched terms", - type: "ol", - }); - maxToDisplay = Math.min( - result.semanticRefMatches.length, - maxToDisplay, - ); - this.writeScoredSemanticRefs( - result.semanticRefMatches, - conversation.semanticRefs!, - maxToDisplay, - ); - } - } - - public writeSearchResults( - conversation: kp.IConversation, - results: Map, - maxToDisplay: number, - ) { - // Do entities before actions... 
- this.writeResult(conversation, "entity", results, maxToDisplay); - this.writeResult(conversation, "action", results, maxToDisplay); - this.writeResult(conversation, "topic", results, maxToDisplay); - this.writeResult(conversation, "tag", results, maxToDisplay); - } - - private writeResult( - conversation: kp.IConversation, - type: kp.KnowledgeType, - results: Map, - maxToDisplay: number, - ) { - const result = results.get(type); - if (result !== undefined) { - this.writeTitle(type.toUpperCase()); - this.writeSearchResult(conversation, result, maxToDisplay); - } - } - - public writeConversationInfo(conversation: kp.IConversation) { - this.writeTitle(conversation.nameTag); - this.writeLine(`${conversation.messages.length} messages`); - return this; - } - - public writePodcastInfo(podcast: kp.Podcast) { - this.writeConversationInfo(podcast); - this.writeList(getPodcastParticipants(podcast), { - type: "csv", - title: "Participants", - }); - } - - public writeIndexingResults( - results: kp.ConversationIndexingResult, - verbose = false, - ) { - if (results.failedMessages.length > 0) { - this.writeError( - `Errors for ${results.failedMessages.length} messages`, - ); - if (verbose) { - for (const failedMessage of results.failedMessages) { - this.writeInColor( - chalk.cyan, - failedMessage.message.textChunks[0], - ); - this.writeError(failedMessage.error); - } - } - } - } -} - -export function filterSemanticRefsByType( - semanticRefs: kp.SemanticRef[] | undefined, - type: string, -): kp.SemanticRef[] { - const matches: kp.SemanticRef[] = []; - if (semanticRefs) { - for (const ref of semanticRefs) { - if (ref.knowledgeType === type) { - matches.push(ref); - } - } - } - return matches; -} - -export function getPodcastParticipants(podcast: kp.Podcast) { - const participants = new Set(); - for (let message of podcast.messages) { - const meta = message.metadata; - if (meta.speaker) { - participants.add(meta.speaker); - } - meta.listeners.forEach((l) => participants.add(l)); - 
} - return [...participants.values()]; -} - export function parseQueryTerms(args: string[]): kp.QueryTerm[] { const queryTerms: kp.QueryTerm[] = []; for (const arg of args) { diff --git a/ts/examples/chat/src/memory/knowproPrinter.ts b/ts/examples/chat/src/memory/knowproPrinter.ts new file mode 100644 index 000000000..15e8a7ddd --- /dev/null +++ b/ts/examples/chat/src/memory/knowproPrinter.ts @@ -0,0 +1,201 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +import * as kp from "knowpro"; +import * as knowLib from "knowledge-processor"; +import { ChatPrinter } from "../chatPrinter.js"; +import chalk from "chalk"; + +export class KnowProPrinter extends ChatPrinter { + constructor() { + super(); + } + + public writeMessage(message: kp.IMessage) { + const prevColor = this.setForeColor(chalk.cyan); + try { + this.writeNameValue("Timestamp", message.timestamp); + this.writeJson(message.metadata); + } finally { + this.setForeColor(prevColor); + } + for (const chunk of message.textChunks) { + this.write(chunk); + } + this.writeLine(); + } + + public writeEntity( + entity: knowLib.conversation.ConcreteEntity | undefined, + ) { + if (entity !== undefined) { + this.writeLine(entity.name.toUpperCase()); + this.writeList(entity.type, { type: "csv" }); + if (entity.facets) { + const facetList = entity.facets.map((f) => + knowLib.conversation.facetToString(f), + ); + this.writeList(facetList, { type: "ul" }); + } + } + return this; + } + + public writeAction(action: knowLib.conversation.Action | undefined) { + if (action !== undefined) { + this.writeLine(knowLib.conversation.actionToString(action)); + } + } + + public writeTopic(topic: kp.ITopic | undefined) { + if (topic !== undefined) { + this.writeLine(topic.text); + } + } + + public writeSemanticRef(semanticRef: kp.SemanticRef) { + switch (semanticRef.knowledgeType) { + default: + this.writeLine(semanticRef.knowledgeType); + break; + case "entity": + this.writeEntity( + semanticRef.knowledge as 
knowLib.conversation.ConcreteEntity, + ); + break; + case "action": + this.writeAction( + semanticRef.knowledge as knowLib.conversation.Action, + ); + break; + case "topic": + this.writeTopic(semanticRef.knowledge as kp.ITopic); + break; + } + return this; + } + + public writeSemanticRefs(refs: kp.SemanticRef[] | undefined) { + if (refs && refs.length > 0) { + for (const ref of refs) { + this.writeSemanticRef(ref); + this.writeLine(); + } + } + return this; + } + + public writeScoredSemanticRefs( + semanticRefMatches: kp.ScoredSemanticRef[], + semanticRefs: kp.SemanticRef[], + maxToDisplay: number, + ) { + this.writeLine( + `Displaying ${maxToDisplay} matches of total ${semanticRefMatches.length}`, + ); + for (let i = 0; i < maxToDisplay; ++i) { + const match = semanticRefMatches[i]; + const semanticRef = semanticRefs[match.semanticRefIndex]; + + this.writeInColor( + chalk.green, + `#${i + 1}: ${semanticRef.knowledgeType} [${match.score}]`, + ); + this.writeSemanticRef(semanticRef); + this.writeLine(); + } + } + + public writeSearchResult( + conversation: kp.IConversation, + result: kp.SearchResult | undefined, + maxToDisplay: number, + ) { + if (result) { + this.writeListInColor(chalk.cyanBright, result.termMatches, { + title: "Matched terms", + type: "ol", + }); + maxToDisplay = Math.min( + result.semanticRefMatches.length, + maxToDisplay, + ); + this.writeScoredSemanticRefs( + result.semanticRefMatches, + conversation.semanticRefs!, + maxToDisplay, + ); + } + } + + public writeSearchResults( + conversation: kp.IConversation, + results: Map, + maxToDisplay: number, + ) { + // Do entities before actions... 
+ this.writeResult(conversation, "entity", results, maxToDisplay); + this.writeResult(conversation, "action", results, maxToDisplay); + this.writeResult(conversation, "topic", results, maxToDisplay); + this.writeResult(conversation, "tag", results, maxToDisplay); + } + + private writeResult( + conversation: kp.IConversation, + type: kp.KnowledgeType, + results: Map, + maxToDisplay: number, + ) { + const result = results.get(type); + if (result !== undefined) { + this.writeTitle(type.toUpperCase()); + this.writeSearchResult(conversation, result, maxToDisplay); + } + } + + public writeConversationInfo(conversation: kp.IConversation) { + this.writeTitle(conversation.nameTag); + this.writeLine(`${conversation.messages.length} messages`); + return this; + } + + public writePodcastInfo(podcast: kp.Podcast) { + this.writeConversationInfo(podcast); + this.writeList(getPodcastParticipants(podcast), { + type: "csv", + title: "Participants", + }); + } + + public writeIndexingResults( + results: kp.ConversationIndexingResult, + verbose = false, + ) { + if (results.failedMessages.length > 0) { + this.writeError( + `Errors for ${results.failedMessages.length} messages`, + ); + if (verbose) { + for (const failedMessage of results.failedMessages) { + this.writeInColor( + chalk.cyan, + failedMessage.message.textChunks[0], + ); + this.writeError(failedMessage.error); + } + } + } + } +} + +function getPodcastParticipants(podcast: kp.Podcast) { + const participants = new Set(); + for (let message of podcast.messages) { + const meta = message.metadata; + if (meta.speaker) { + participants.add(meta.speaker); + } + meta.listeners.forEach((l) => participants.add(l)); + } + return [...participants.values()]; +} diff --git a/ts/examples/chat/src/memory/podcastMemory.ts b/ts/examples/chat/src/memory/podcastMemory.ts index 1eec19e25..3af569844 100644 --- a/ts/examples/chat/src/memory/podcastMemory.ts +++ b/ts/examples/chat/src/memory/podcastMemory.ts @@ -21,7 +21,6 @@ import { 
parseNamedArguments, } from "interactive-app"; import { - addMinutesToDate, argClean, argPause, argSourceFileOrFolder, @@ -480,7 +479,7 @@ export function createPodcastCommands( const sourcePath = namedArgs.sourcePath; const startAt = argToDate(namedArgs.startAt); const endAt = startAt - ? addMinutesToDate(startAt, namedArgs.length) + ? dateTime.addMinutesToDate(startAt, namedArgs.length) : undefined; await importTranscript(sourcePath, startAt, endAt); } diff --git a/ts/packages/knowPro/src/accumulators.ts b/ts/packages/knowPro/src/accumulators.ts index d44aabd0b..a601e830b 100644 --- a/ts/packages/knowPro/src/accumulators.ts +++ b/ts/packages/knowPro/src/accumulators.ts @@ -144,19 +144,6 @@ export class MatchAccumulator { return topN.length; } - public union(other: MatchAccumulator): void { - for (const matchFrom of other.matches.values()) { - const matchTo = this.matches.get(matchFrom.value); - if (matchTo !== undefined) { - // Existing - matchTo.hitCount += matchFrom.hitCount; - matchTo.score += matchFrom.score; - } else { - this.matches.set(matchFrom.value, matchFrom); - } - } - } - private matchesWithMinHitCount( minHitCount: number | undefined, ): IterableIterator> { @@ -282,10 +269,8 @@ export class SemanticRefAccumulator extends MatchAccumulator { } private getMinHitCount(minHitCount?: number): number { - return minHitCount !== undefined - ? minHitCount - : //: this.queryTermMatches.termMatches.size; - this.maxHits; + return minHitCount !== undefined ? 
minHitCount : this.maxHits; + //: this.queryTermMatches.termMatches.size; } } @@ -346,6 +331,12 @@ export class TextRangeAccumulator { textRanges.push(textRange); } + public addRanges(textRanges: TextRange[]) { + for (const range of textRanges) { + this.addRange(range); + } + } + public isInRange(textRange: TextRange): boolean { const textRanges = this.rangesForMessage.get( textRange.start.messageIndex, diff --git a/ts/packages/knowPro/src/dataFormat.ts b/ts/packages/knowPro/src/dataFormat.ts index b89cbd767..27e8fb0b8 100644 --- a/ts/packages/knowPro/src/dataFormat.ts +++ b/ts/packages/knowPro/src/dataFormat.ts @@ -76,6 +76,7 @@ export interface IConversation { semanticRefIndex?: ITermToSemanticRefIndex | undefined; semanticRefs: SemanticRef[] | undefined; relatedTermsIndex?: ITermToRelatedTermsIndex | undefined; + timestampIndex?: ITimestampToMessageIndex | undefined; } export type MessageIndex = number; @@ -138,3 +139,12 @@ export interface ITextEmbeddingDataItem { text: string; embedding: number[]; } + +export type DateRange = { + start: Date; + end?: Date | undefined; +}; + +export interface ITimestampToMessageIndex { + getTextRange(dateRange: DateRange): TextRange[]; +} diff --git a/ts/packages/knowPro/src/import.ts b/ts/packages/knowPro/src/import.ts index a5be529e2..e8631341f 100644 --- a/ts/packages/knowPro/src/import.ts +++ b/ts/packages/knowPro/src/import.ts @@ -10,7 +10,7 @@ import { ITextEmbeddingData, } from "./dataFormat.js"; import { conversation, split } from "knowledge-processor"; -import { collections, getFileName, readAllText } from "typeagent"; +import { collections, dateTime, getFileName, readAllText } from "typeagent"; import { ConversationIndex, addActionToIndex, @@ -26,6 +26,7 @@ import { SemanticIndexSettings, TermSemanticIndex, } from "./termIndex.js"; +import { TimestampToMessageIndex } from "./timestampIndex.js"; // metadata for podcast messages export class PodcastMessageMeta implements IKnowledgeSource { @@ -122,6 +123,7 @@ export 
class Podcast implements IConversation { public semanticRefs: SemanticRef[] = [], public semanticRefIndex: ConversationIndex | undefined = undefined, public relatedTermsIndex: TermSemanticIndex | undefined = undefined, + public timestampIndex: TimestampToMessageIndex | undefined = undefined, ) { this.settings = createPodcastSettings(); } @@ -159,28 +161,14 @@ export class Podcast implements IConversation { } } - generateTimestamps() { + public generateTimestamps(startDate?: Date, lengthMinutes: number = 60) { // generate a random date within the last 10 years - const date = new Date(); - const startHour = 14; - date.setFullYear(date.getFullYear() - Math.floor(Math.random() * 10)); - date.setMonth(Math.floor(Math.random() * 12)); - date.setDate(Math.floor(Math.random() * 28)); - const seconds = 3600; - let cumulativeLength = 0; - const cumulativeLengths = this.messages.map((msg) => { - const msgCum = cumulativeLength; - cumulativeLength += msg.textChunks[0].length; - return msgCum; - }); - for (let i = 0; i < this.messages.length; i++) { - const lengthPct = cumulativeLengths[i] / cumulativeLength; - const msgSeconds = lengthPct * seconds; - const minutes = Math.floor((msgSeconds % 3600) / 60); - const secs = Math.floor(msgSeconds % 60); - const timestamp = `${date.toISOString()}T${startHour}:${minutes}:${secs}`; - this.messages[i].timestamp = timestamp; - } + startDate ??= randomDate(); + timestampMessages( + this.messages, + startDate, + dateTime.addMinutesToDate(startDate, lengthMinutes), + ); } public async buildIndex( @@ -191,6 +179,7 @@ export class Podcast implements IConversation { ): Promise { const result = await buildConversationIndex(this, progressCallback); this.addMetadataToIndex(); + this.buildTimestampIndex(); return result; } @@ -212,6 +201,10 @@ export class Podcast implements IConversation { } } + public buildTimestampIndex(): void { + this.timestampIndex = new TimestampToMessageIndex(this.messages); + } + public serialize(): PodcastData { return { 
nameTag: this.nameTag, @@ -235,6 +228,7 @@ export class Podcast implements IConversation { data.relatedTermIndexData, ); } + this.buildTimestampIndex(); } } @@ -245,6 +239,8 @@ export interface PodcastData extends IConversationData { export async function importPodcast( transcriptFilePath: string, podcastName?: string, + startDate?: Date, + lengthMinutes: number = 60, ): Promise { const transcriptText = await readAllText(transcriptFilePath); podcastName ??= getFileName(transcriptFilePath); @@ -290,8 +286,56 @@ export async function importPodcast( } assignMessageListeners(msgs, participants); const pod = new Podcast(podcastName, msgs, [podcastName]); - // TODO: add timestamps and more tags + pod.generateTimestamps(startDate, lengthMinutes); + // TODO: add more tags // list all the books // what did K say about Children of Time? return pod; } + +/** + * Text (such as a transcript) can be collected over a time range. + * This text can be partitioned into blocks. However, timestamps for individual blocks are not available. + * Assigns individual timestamps to blocks proportional to their lengths. + * @param messages Messages to assign timestamps to; each message's timestamp is overwritten in place + * @param startDate Timestamp given to the first message + * @param endDate End of the time range; must be later than startDate, otherwise an Error is thrown + */ +export function timestampMessages( + messages: IMessage[], + startDate: Date, + endDate: Date, +): void { + let startTicks = startDate.getTime(); + const ticksLength = endDate.getTime() - startTicks; + if (ticksLength <= 0) { + throw new Error(`${startDate} is not < ${endDate}`); + } + let messageLengths = messages.map((m) => messageLength(m)); + const textLength: number = messageLengths.reduce( + (total: number, l) => total + l, + 0, + ); + const ticksPerChar = textLength > 0 ? ticksLength / textLength : 0; + for (let i = 0; i < messages.length; ++i) { + messages[i].timestamp = new Date(startTicks).toISOString(); + // Now, we will 'elapse' time .. proportional to length of the text + // This assumes that each speaker speaks equally fast... 
+ startTicks += ticksPerChar * messageLengths[i]; + } + + function messageLength(message: IMessage): number { + return message.textChunks.reduce( + (total: number, chunk) => total + chunk.length, + 0, + ); + } +} + +function randomDate(startHour = 14) { + const date = new Date(); + date.setFullYear(date.getFullYear() - Math.floor(Math.random() * 10)); + date.setMonth(Math.floor(Math.random() * 12)); + date.setDate(1 + Math.floor(Math.random() * 28)); + return date; +} diff --git a/ts/packages/knowPro/src/query.ts b/ts/packages/knowPro/src/query.ts index 18abf0ccd..27d61c945 100644 --- a/ts/packages/knowPro/src/query.ts +++ b/ts/packages/knowPro/src/query.ts @@ -2,10 +2,10 @@ // Licensed under the MIT License. import { + DateRange, IConversation, IMessage, ITag, - ITermToRelatedTermsIndex, ITermToSemanticRefIndex, ITopic, KnowledgeType, @@ -23,7 +23,7 @@ import { SemanticRefAccumulator, TextRangeAccumulator, } from "./accumulators.js"; -import { collections, dateTime } from "typeagent"; +import { collections } from "typeagent"; export function isConversationSearchable(conversation: IConversation): boolean { return ( @@ -62,21 +62,6 @@ export function timestampRangeForConversation( return undefined; } -/** - * Assumes messages are in timestamp order. - * @param conversation - */ -export function getMessagesInDateRange( - conversation: IConversation, - dateRange: DateRange, -): IMessage[] { - return collections.getInRange( - conversation.messages, - dateTime.timestampString(dateRange.start), - dateRange.end ? 
dateTime.timestampString(dateRange.end) : undefined, - (x, y) => x.localeCompare(y), - ); -} /** * Returns: * 0 if locations are equal @@ -109,20 +94,15 @@ export function isInTextRange( innerRange: TextRange, ): boolean { // outer start must be <= inner start - // inner end must be <= outerEnd + // inner end must be < outerEnd (which is exclusive) let cmpStart = compareTextLocation(outerRange.start, innerRange.start); let cmpEnd = compareTextLocation( innerRange.end ?? MaxTextLocation, outerRange.end ?? MaxTextLocation, ); - return cmpStart <= 0 && cmpEnd <= 0; + return cmpStart <= 0 && cmpEnd < 0; } -export type DateRange = { - start: Date; - end?: Date | undefined; -}; - export function compareDates(x: Date, y: Date): number { return x.getTime() - y.getTime(); } @@ -143,27 +123,14 @@ export interface IQueryOpExpr { } export class QueryEvalContext { - constructor(private conversation: IConversation) { + constructor(public conversation: IConversation) { if (!isConversationSearchable(conversation)) { throw new Error(`${conversation.nameTag} is not initialized`); } } - public get semanticRefIndex(): ITermToSemanticRefIndex { - this.conversation.messages; - return this.conversation.semanticRefIndex!; - } - - public get semanticRefs(): SemanticRef[] { - return this.conversation.semanticRefs!; - } - - public get relatedTermIndex(): ITermToRelatedTermsIndex | undefined { - return this.conversation.relatedTermsIndex; - } - public getSemanticRef(semanticRefIndex: SemanticRefIndex): SemanticRef { - return this.semanticRefs[semanticRefIndex]; + return this.conversation.semanticRefs![semanticRefIndex]; } public getMessageForRef(semanticRef: SemanticRef): IMessage { @@ -196,10 +163,12 @@ export class TermsMatchExpr implements IQueryOpExpr { ): Promise { const matchAccumulator: SemanticRefAccumulator = new SemanticRefAccumulator(); - const index = context.semanticRefIndex; - const terms = await this.terms.eval(context); - for (const queryTerm of terms) { - 
this.accumulateMatches(index, matchAccumulator, queryTerm); + const index = context.conversation.semanticRefIndex; + if (index !== undefined) { + const terms = await this.terms.eval(context); + for (const queryTerm of terms) { + this.accumulateMatches(index, matchAccumulator, queryTerm); + } } return Promise.resolve(matchAccumulator); } @@ -233,7 +202,7 @@ export class ResolveRelatedTermsExpr implements IQueryOpExpr { public async eval(context: QueryEvalContext): Promise { const terms = await this.terms.eval(context); - const index = context.relatedTermIndex; + const index = context.conversation.relatedTermsIndex; if (index !== undefined) { for (const queryTerm of terms) { if ( @@ -272,7 +241,7 @@ export class GroupByKnowledgeTypeExpr ): Promise> { const semanticRefMatches = await this.matches.eval(context); return semanticRefMatches.groupMatchesByKnowledgeType( - context.semanticRefs, + context.conversation.semanticRefs!, ); } } @@ -550,31 +519,38 @@ function matchTextOneOf( export class ScopeExpr implements IQueryOpExpr { constructor( public sourceExpr: IQueryOpExpr, - // Predicates that identity what is in scope + // Predicates that look at matched semantic refs to determine what is in scope public predicates: IQuerySemanticRefPredicate[], + public scopeExpr: IQueryOpExpr | undefined = undefined, ) {} public async eval( context: QueryEvalContext, ): Promise { let accumulator = await this.sourceExpr.eval(context); - const tagScope = new TextRangeAccumulator(); + const scope = new TextRangeAccumulator(); + if (this.scopeExpr !== undefined) { + const timeRanges = await this.scopeExpr.eval(context); + if (timeRanges !== undefined) { + scope.addRanges(timeRanges); + } + } for (const inScopeRef of accumulator.getSemanticRefs( - context.semanticRefs, + context.conversation.semanticRefs!, (sr) => this.evalPredicates( context, accumulator.queryTermMatches, - this.predicates, + this.predicates!, sr, ), )) { - tagScope.addRange(inScopeRef.range); + 
scope.addRange(inScopeRef.range); } - if (tagScope.size > 0) { + if (scope.size > 0) { accumulator = accumulator.selectInScope( - context.semanticRefs, - tagScope, + context.conversation.semanticRefs!, + scope, ); } return Promise.resolve(accumulator); @@ -594,3 +570,16 @@ export class ScopeExpr implements IQueryOpExpr { return false; } } + +export class TimestampScopeExpr implements IQueryOpExpr { + constructor(public dateRange: DateRange) {} + + public eval(context: QueryEvalContext): Promise { + const index = context.conversation.timestampIndex; + let textRanges: TextRange[] | undefined; + if (index !== undefined) { + textRanges = index.getTextRange(this.dateRange); + } + return Promise.resolve(textRanges ?? []); + } +} diff --git a/ts/packages/knowPro/src/search.ts b/ts/packages/knowPro/src/search.ts index 5a676d415..1456105f9 100644 --- a/ts/packages/knowPro/src/search.ts +++ b/ts/packages/knowPro/src/search.ts @@ -3,6 +3,7 @@ import { SemanticRefAccumulator } from "./accumulators.js"; import { + DateRange, IConversation, KnowledgeType, QueryTerm, @@ -19,6 +20,7 @@ export type SearchResult = { export type SearchFilter = { type?: KnowledgeType | undefined; propertiesToMatch?: Record; + dateRange?: DateRange; }; /** * Searches conversation for terms @@ -93,9 +95,7 @@ class SearchQueryBuilder { : queryTerms, ); // Always apply "tag match" scope... all text ranges that matched tags.. are in scope - termsMatchExpr = new q.ScopeExpr(termsMatchExpr, [ - new q.KnowledgeTypePredicate("tag"), - ]); + termsMatchExpr = this.compileScope(termsMatchExpr, filter?.dateRange); if (filter !== undefined) { // Where clause termsMatchExpr = new q.WhereSemanticRefExpr( @@ -106,6 +106,19 @@ class SearchQueryBuilder { return termsMatchExpr; } + private compileScope( + termsMatchExpr: q.IQueryOpExpr, + dateRange?: DateRange, + ): q.IQueryOpExpr { + // Always apply "tag match" scope... all text ranges that matched tags.. 
are in scope + termsMatchExpr = new q.ScopeExpr( + termsMatchExpr, + [new q.KnowledgeTypePredicate("tag")], + dateRange ? new q.TimestampScopeExpr(dateRange) : undefined, + ); + return termsMatchExpr; + } + private compileFilter( filter: SearchFilter, ): q.IQuerySemanticRefPredicate[] { diff --git a/ts/packages/knowPro/src/timestampIndex.ts b/ts/packages/knowPro/src/timestampIndex.ts new file mode 100644 index 000000000..dc9ba7560 --- /dev/null +++ b/ts/packages/knowPro/src/timestampIndex.ts @@ -0,0 +1,81 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +import { collections, dateTime } from "typeagent"; +import { + DateRange, + IMessage, + ITimestampToMessageIndex, + MessageIndex, + TextRange, +} from "./dataFormat.js"; + +export class TimestampToMessageIndex implements ITimestampToMessageIndex { + private messageIndex: TimestampedTextRange[]; + constructor(messages: IMessage[]) { + this.messageIndex = []; + for (let i = 0; i < messages.length; ++i) { + this.addMessage(messages[i], i); + } + this.messageIndex.sort(compareTimestampedRange); + } + + public getTextRange(dateRange: DateRange): TextRange[] { + const startAt = dateTime.timestampString(dateRange.start); + const stopAt = dateRange.end + ? 
dateTime.timestampString(dateRange.end) + : undefined; + const ranges: TimestampedTextRange[] = collections.getInRange( + this.messageIndex, + startAt, + stopAt, + compareTimestampedRange, + ); + return ranges.map((r) => r.range); + } + + private addMessage( + message: IMessage, + messageIndex: MessageIndex, + inOrder = false, + ): boolean { + if (!message.timestamp) { + return false; + } + const date = new Date(message.timestamp); + // This string is formatted to be searchable + const entry = this.makeTimestamped(date, messageIndex); + if (inOrder) { + collections.insertIntoSorted( + this.messageIndex, + entry, + compareTimestampedRange, + ); + } else { + this.messageIndex.push(entry); + } + return true; + } + + private makeTimestamped( + timestamp: Date, + messageIndex: MessageIndex, + ): TimestampedTextRange { + return { + range: { start: { messageIndex } }, + timestamp: dateTime.timestampString(timestamp, false), + }; + } +} + +type TimestampedTextRange = { + timestamp: string; + range: TextRange; +}; + +function compareTimestampedRange( + x: TimestampedTextRange, + y: TimestampedTextRange, +) { + return x.timestamp.localeCompare(y.timestamp); +}