Skip to content

Commit

Permalink
Fix Chroma DB if multiple docs are inserted (#2144)
Browse files Browse the repository at this point in the history
## Description

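ChromaDB did not insert any docs after the first one: `doc_exists` returned `True` whenever the collection already held any document, instead of checking whether the specific document's content was present.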

Fixes #2129
  • Loading branch information
dirkbrnd authored Feb 17, 2025
1 parent 3e8b4a5 commit 7450248
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 10 deletions.
7 changes: 6 additions & 1 deletion libs/agno/agno/embedder/google.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,12 @@ def client(self):
return self.gemini_client

def _response(self, text: str) -> EmbedContentResponse:
_request_params: Dict[str, Any] = {"contents": text, "model": self.id, "config": {}}
# If a user provides a model id with the `models/` prefix, we need to remove it
_id = self.id
if _id.startswith("models/"):
_id = _id.split("/")[-1]

_request_params: Dict[str, Any] = {"contents": text, "model": _id, "config": {}}
if self.dimensions:
_request_params["config"]["output_dimensionality"] = self.dimensions
if self.task_type:
Expand Down
24 changes: 15 additions & 9 deletions libs/agno/agno/vectordb/chroma/chromadb.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,14 +90,20 @@ def doc_exists(self, document: Document) -> bool:
Returns:
bool: True if document exists, False otherwise.
"""
if self.client:
try:
collection: Collection = self.client.get_collection(name=self.collection_name)
collection_data: GetResult = collection.get(include=[IncludeEnum.documents])
if collection_data.get("documents") != []:
return True
except Exception as e:
logger.error(f"Document does not exist: {e}")
if not self.client:
logger.warning("Client not initialized")
return False

try:
collection: Collection = self.client.get_collection(name=self.collection_name)
collection_data: GetResult = collection.get(include=[IncludeEnum.documents])
existing_documents = collection_data.get("documents", [])
cleaned_content = document.content.replace("\x00", "\ufffd")
if cleaned_content in existing_documents: # type: ignore
return True
except Exception as e:
logger.error(f"Document does not exist: {e}")

return False

def name_exists(self, name: str) -> bool:
Expand Down Expand Up @@ -217,7 +223,7 @@ def search(self, query: str, limit: int = 5, filters: Optional[Dict[str, Any]] =
metadata = result.get("metadatas", [{}])[0] # type: ignore
documents = result.get("documents", [[]])[0] # type: ignore
embeddings = result.get("embeddings")[0] # type: ignore
embeddings = [e.tolist() if hasattr(e, "tolist") else e for e in embeddings]
embeddings = [e.tolist() if hasattr(e, "tolist") else e for e in embeddings] # type: ignore
distances = result.get("distances", [[]])[0] # type: ignore

for idx, distance in enumerate(distances):
Expand Down
18 changes: 18 additions & 0 deletions libs/agno/tests/unit/vectordb/test_chromadb.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,3 +163,21 @@ def test_custom_embedder(mock_embedder):
finally:
if os.path.exists(TEST_PATH):
shutil.rmtree(TEST_PATH)


def test_multiple_document_operations(chroma_db, sample_documents):
    """Insert documents in two separate calls and check they are all stored.

    The first call inserts two documents at once; the second call inserts a
    single further document. The count is asserted after each call, and a
    search for "curry" must return exactly one matching result.
    """
    # First insert: two documents in one batch.
    chroma_db.insert(sample_documents[:2])
    assert chroma_db.get_count() == 2

    # Second insert: a single document added to the already-populated
    # collection; the count must grow to three.
    chroma_db.insert([sample_documents[2]])
    assert chroma_db.get_count() == 3

    # A limit-1 search must return exactly one hit mentioning the query term.
    hits = chroma_db.search("curry", limit=1)
    assert len(hits) == 1
    top_hit = hits[0]
    assert "curry" in top_hit.content.lower()

0 comments on commit 7450248

Please sign in to comment.