diff --git a/api/core/rag/docstore/dataset_docstore.py b/api/core/rag/docstore/dataset_docstore.py index 8b95d81cc1124b..e4d6e2a5692078 100644 --- a/api/core/rag/docstore/dataset_docstore.py +++ b/api/core/rag/docstore/dataset_docstore.py @@ -115,6 +115,7 @@ def add_documents(self, docs: Sequence[Document], allow_update: bool = True, sav tokens=tokens, enabled=False, created_by=self._user_id, + page_number=doc.metadata.get("page"), ) if doc.metadata.get("answer"): segment_document.answer = doc.metadata.pop("answer", "") diff --git a/api/core/rag/retrieval/dataset_retrieval.py b/api/core/rag/retrieval/dataset_retrieval.py index e1d36aad1fa5d7..48cc37c34c20a5 100644 --- a/api/core/rag/retrieval/dataset_retrieval.py +++ b/api/core/rag/retrieval/dataset_retrieval.py @@ -212,6 +212,7 @@ def retrieve( source["word_count"] = segment.word_count source["segment_position"] = segment.position source["index_node_hash"] = segment.index_node_hash + source["page_number"] = segment.page_number if segment.answer: source["content"] = f"question:{segment.content} \nanswer:{segment.answer}" else: diff --git a/api/core/tools/tool/dataset_retriever/dataset_multi_retriever_tool.py b/api/core/tools/tool/dataset_retriever/dataset_multi_retriever_tool.py index a4afea4b9df429..41dc4b96d74665 100644 --- a/api/core/tools/tool/dataset_retriever/dataset_multi_retriever_tool.py +++ b/api/core/tools/tool/dataset_retriever/dataset_multi_retriever_tool.py @@ -130,6 +130,7 @@ def _run(self, query: str) -> str: source["word_count"] = segment.word_count source["segment_position"] = segment.position source["index_node_hash"] = segment.index_node_hash + source["page_number"] = segment.page_number if segment.answer: source["content"] = f"question:{segment.content} \nanswer:{segment.answer}" else: diff --git a/api/core/tools/tool/dataset_retriever/dataset_retriever_tool.py b/api/core/tools/tool/dataset_retriever/dataset_retriever_tool.py index 8d6e821f4c02e0..b52ceaefa18d79 100644 --- 
a/api/core/tools/tool/dataset_retriever/dataset_retriever_tool.py +++ b/api/core/tools/tool/dataset_retriever/dataset_retriever_tool.py @@ -179,6 +179,7 @@ def _run(self, query: str) -> str: source["word_count"] = segment.word_count source["segment_position"] = segment.position source["index_node_hash"] = segment.index_node_hash + source["page_number"] = segment.page_number if segment.answer: source["content"] = f"question:{segment.content} \nanswer:{segment.answer}" else: diff --git a/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py b/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py index 0f239af51ae79c..76839312c27190 100644 --- a/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py +++ b/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py @@ -240,6 +240,7 @@ def _fetch_dataset_retriever(self, node_data: KnowledgeRetrievalNodeData, query: "segment_word_count": segment.word_count, "segment_position": segment.position, "segment_index_node_hash": segment.index_node_hash, + "segment_page_number": segment.page_number, }, "title": document.name, } diff --git a/api/migrations/versions/2025_02_14_0657-179653aaaf88_add_page_number_column_to_document_.py b/api/migrations/versions/2025_02_14_0657-179653aaaf88_add_page_number_column_to_document_.py new file mode 100644 index 00000000000000..68eac82cbc7f76 --- /dev/null +++ b/api/migrations/versions/2025_02_14_0657-179653aaaf88_add_page_number_column_to_document_.py @@ -0,0 +1,33 @@ +"""Add page_number column to document_segment + +Revision ID: 179653aaaf88 +Revises: a91b476a53de +Create Date: 2025-02-14 06:57:54.547192 + +""" +from alembic import op +import models as models +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = '179653aaaf88' +down_revision = 'a91b476a53de' +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table('document_segments', schema=None) as batch_op: + batch_op.add_column(sa.Column('page_number', sa.Integer(), nullable=True)) + + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table('document_segments', schema=None) as batch_op: + batch_op.drop_column('page_number') + + # ### end Alembic commands ### diff --git a/api/models/dataset.py b/api/models/dataset.py index 1cf3dc42fe8235..f358c955e68ac8 100644 --- a/api/models/dataset.py +++ b/api/models/dataset.py @@ -517,6 +517,7 @@ class DocumentSegment(db.Model): # type: ignore[name-defined] dataset_id = db.Column(StringUUID, nullable=False) document_id = db.Column(StringUUID, nullable=False) position: Mapped[int] + page_number = db.Column(db.Integer, nullable=True) content = db.Column(db.Text, nullable=False) answer = db.Column(db.Text, nullable=True) word_count = db.Column(db.Integer, nullable=False)