Fix Chroma DB if multiple docs are inserted (#2144)

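## Description

ChromaDB did not insert any docs after the first one: `doc_exists` returned `True` whenever the collection already contained any document, so every later document was treated as a duplicate. It now checks whether the document's own (cleaned) content is present in the collection. Fixes #2129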
agno-agi · Feb 17, 2025 · 7450248
1 parent 3e8b4a5
commit 7450248
Show file tree

Hide file tree

Showing 3 changed files with 39 additions and 10 deletions.
diff --git a/libs/agno/agno/embedder/google.py b/libs/agno/agno/embedder/google.py
@@ -45,7 +45,12 @@ def client(self):
         return self.gemini_client
 
     def _response(self, text: str) -> EmbedContentResponse:
-        _request_params: Dict[str, Any] = {"contents": text, "model": self.id, "config": {}}
+        # If a user provides a model id with the `models/` prefix, we need to remove it
+        _id = self.id
+        if _id.startswith("models/"):
+            _id = _id.split("/")[-1]
+
+        _request_params: Dict[str, Any] = {"contents": text, "model": _id, "config": {}}
         if self.dimensions:
             _request_params["config"]["output_dimensionality"] = self.dimensions
         if self.task_type:

diff --git a/libs/agno/agno/vectordb/chroma/chromadb.py b/libs/agno/agno/vectordb/chroma/chromadb.py
@@ -90,14 +90,20 @@ def doc_exists(self, document: Document) -> bool:
         Returns:
             bool: True if document exists, False otherwise.
         """
-        if self.client:
-            try:
-                collection: Collection = self.client.get_collection(name=self.collection_name)
-                collection_data: GetResult = collection.get(include=[IncludeEnum.documents])
-                if collection_data.get("documents") != []:
-                    return True
-            except Exception as e:
-                logger.error(f"Document does not exist: {e}")
+        if not self.client:
+            logger.warning("Client not initialized")
+            return False
+
+        try:
+            collection: Collection = self.client.get_collection(name=self.collection_name)
+            collection_data: GetResult = collection.get(include=[IncludeEnum.documents])
+            existing_documents = collection_data.get("documents", [])
+            cleaned_content = document.content.replace("\x00", "\ufffd")
+            if cleaned_content in existing_documents:  # type: ignore
+                return True
+        except Exception as e:
+            logger.error(f"Document does not exist: {e}")
+
         return False
 
     def name_exists(self, name: str) -> bool:
@@ -217,7 +223,7 @@ def search(self, query: str, limit: int = 5, filters: Optional[Dict[str, Any]] =
         metadata = result.get("metadatas", [{}])[0]  # type: ignore
         documents = result.get("documents", [[]])[0]  # type: ignore
         embeddings = result.get("embeddings")[0]  # type: ignore
-        embeddings = [e.tolist() if hasattr(e, "tolist") else e for e in embeddings]
+        embeddings = [e.tolist() if hasattr(e, "tolist") else e for e in embeddings]  # type: ignore
         distances = result.get("distances", [[]])[0]  # type: ignore
 
         for idx, distance in enumerate(distances):

diff --git a/libs/agno/tests/unit/vectordb/test_chromadb.py b/libs/agno/tests/unit/vectordb/test_chromadb.py
@@ -163,3 +163,21 @@ def test_custom_embedder(mock_embedder):
     finally:
         if os.path.exists(TEST_PATH):
             shutil.rmtree(TEST_PATH)
+
+
+def test_multiple_document_operations(chroma_db, sample_documents):
+    """Test multiple document operations including batch inserts"""
+    # Test batch insert
+    first_batch = sample_documents[:2]
+    chroma_db.insert(first_batch)
+    assert chroma_db.get_count() == 2
+
+    # Test adding another document
+    second_batch = [sample_documents[2]]
+    chroma_db.insert(second_batch)
+    assert chroma_db.get_count() == 3
+
+    # Verify all documents are searchable
+    curry_results = chroma_db.search("curry", limit=1)
+    assert len(curry_results) == 1
+    assert "curry" in curry_results[0].content.lower()