Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Python: Introducing vector and text search #9345

Merged
merged 14 commits into from
Nov 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
230 changes: 140 additions & 90 deletions .github/workflows/python-integration-tests.yml

Large diffs are not rendered by default.

12 changes: 10 additions & 2 deletions python/.cspell.json
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,14 @@
"vectorizer",
"vectorstoremodel",
"vertexai",
"Weaviate"
"Weaviate",
"qdrant",
"huggingface",
"pytestmark",
"contoso",
"opentelemetry",
"SEMANTICKERNEL",
"OTEL",
"vectorizable"
]
}
}
2 changes: 1 addition & 1 deletion python/.pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ repos:
- id: mypy
files: ^python/semantic_kernel/
name: mypy
entry: uv run mypy -p semantic_kernel --config-file python/mypy.ini
entry: cd python && uv run mypy -p semantic_kernel --config-file mypy.ini
eavanvalkenburg marked this conversation as resolved.
Show resolved Hide resolved
language: system
types: [python]
pass_filenames: false
Expand Down
2 changes: 1 addition & 1 deletion python/.vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"justMyCode": true
"justMyCode": false
}
]
}
5 changes: 3 additions & 2 deletions python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ azure = [
"azure-cosmos ~= 4.7"
]
chroma = [
"chromadb >= 0.4,<0.6"
"chromadb >= 0.5,<0.6"
]
google = [
"google-cloud-aiplatform ~= 1.60",
Expand All @@ -79,7 +79,7 @@ milvus = [
"milvus >= 2.3,<2.3.8; platform_system != 'Windows'"
]
mistralai = [
"mistralai >= 0.4,< 2.0"
"mistralai >= 0.4,< 1.0"
eavanvalkenburg marked this conversation as resolved.
Show resolved Hide resolved
]
ollama = [
"ollama ~= 0.2"
Expand Down Expand Up @@ -140,6 +140,7 @@ environments = [

[tool.pytest.ini_options]
addopts = "-ra -q -r fEX"
asyncio_default_fixture_loop_scope = "function"

[tool.ruff]
line-length = 120
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Copyright (c) Microsoft. All rights reserved.


from typing import Annotated, Any

from pydantic import BaseModel

from semantic_kernel.connectors.ai.open_ai import OpenAIEmbeddingPromptExecutionSettings
from semantic_kernel.data import (
VectorStoreRecordDataField,
VectorStoreRecordKeyField,
VectorStoreRecordVectorField,
vectorstoremodel,
)

###
# The data model used for this sample is based on the hotel data model from the Azure AI Search samples.
# When deploying a new index in Azure AI Search using the import wizard you can choose to deploy the 'hotel-samples'
# dataset, see here: https://learn.microsoft.com/en-us/azure/search/search-get-started-portal.
# This is the dataset used in this sample with some modifications.
# This model adds vectors for the 2 descriptions in English and French.
# Both are based on the 1536 dimensions of the OpenAI models.
# You can adjust this at creation time and then make the change below as well.
###


@vectorstoremodel
class HotelSampleClass(BaseModel):
    """Record model for the hotel sample data used by this sample.

    The model is based on the hotel data model from the Azure AI Search samples,
    extended with one vector field for each of the two descriptions (English and
    French). Both vector fields are declared with 1536 dimensions.
    """

    # Key field of the record.
    hotel_id: Annotated[str, VectorStoreRecordKeyField]
    hotel_name: Annotated[str | None, VectorStoreRecordDataField()] = None
    # English description; its embedding is stored in `description_vector`.
    description: Annotated[
        str,
        VectorStoreRecordDataField(
            has_embedding=True, embedding_property_name="description_vector", is_full_text_searchable=True
        ),
    ]
    # Vector for `description`; optional so records can exist before vectors are added.
    description_vector: Annotated[
        list[float] | None,
        VectorStoreRecordVectorField(
            dimensions=1536,
            local_embedding=True,
            embedding_settings={"embedding": OpenAIEmbeddingPromptExecutionSettings(dimensions=1536)},
        ),
    ] = None
    # French description; its embedding is stored in `description_fr_vector`.
    description_fr: Annotated[
        str, VectorStoreRecordDataField(has_embedding=True, embedding_property_name="description_fr_vector")
    ]
    # Vector for `description_fr`; optional for the same reason as `description_vector`.
    description_fr_vector: Annotated[
        list[float] | None,
        VectorStoreRecordVectorField(
            dimensions=1536,
            local_embedding=True,
            embedding_settings={"embedding": OpenAIEmbeddingPromptExecutionSettings(dimensions=1536)},
        ),
    ] = None
    # Remaining plain data fields of the hotel record.
    category: Annotated[str, VectorStoreRecordDataField()]
    tags: Annotated[list[str], VectorStoreRecordDataField()]
    parking_included: Annotated[bool | None, VectorStoreRecordDataField()] = None
    last_renovation_date: Annotated[str | None, VectorStoreRecordDataField()] = None
    rating: Annotated[float, VectorStoreRecordDataField()]
    location: Annotated[dict[str, Any], VectorStoreRecordDataField()]
    # The sample that consumes this model reads the 'city' and 'country' keys of the address.
    address: Annotated[dict[str, str | None], VectorStoreRecordDataField()]
    rooms: Annotated[list[dict[str, Any]], VectorStoreRecordDataField()]
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# Copyright (c) Microsoft. All rights reserved.

import asyncio

###
# The data model used for this sample is based on the hotel data model from the Azure AI Search samples.
# When deploying a new index in Azure AI Search using the import wizard you can choose to deploy the 'hotel-samples'
# dataset, see here: https://learn.microsoft.com/en-us/azure/search/search-get-started-portal.
# This is the dataset used in this sample with some modifications.
# This model adds vectors for the 2 descriptions in English and French.
# Both are based on the 1536 dimensions of the OpenAI models.
# You can adjust this at creation time and then make the change below as well.
# This sample assumes the index is already deployed; the vector fields may still be empty.
# If the vector fields are empty, set first_run to True to add the vectors.
###
from step_0_data_model import HotelSampleClass

from semantic_kernel import Kernel
from semantic_kernel.connectors.ai.open_ai import OpenAITextEmbedding
from semantic_kernel.connectors.memory.azure_ai_search import AzureAISearchCollection
from semantic_kernel.data import (
VectorSearchOptions,
VectorStoreRecordUtils,
)

# Set to True to have main() add the vectors to the existing records before searching.
first_run = False


async def add_vectors(collection: AzureAISearchCollection, vectorizer: VectorStoreRecordUtils):
    """Add vectors to the records in the collection that do not have them yet.

    The search client of the collection is used to list the hotel ids, the
    matching records are fetched in one batch, and every record that is missing
    at least one of its two description vectors is vectorized and upserted.

    Args:
        collection: The collection holding the hotel records.
        vectorizer: The utility used to add the vectors to a record.
    """
    search_results = await collection.search_client.search(select="hotel_id")
    ids: list[str] = [res.get("hotel_id") async for res in search_results]
    if not ids:
        # An empty index has nothing to vectorize; indexing ids[0] below would raise IndexError.
        print("no records found in the collection, nothing to vectorize")
        return
    print("sample id:", ids[0])

    hotels = await collection.get_batch(ids)
    if not isinstance(hotels, list):
        # Covers both None and any non-list result.
        return
    for hotel in hotels:
        if hotel.description_vector and hotel.description_fr_vector:
            # Both vectors are already present, leave the record untouched.
            continue
        hotel = await vectorizer.add_vector_to_records(hotel, HotelSampleClass)
        await collection.upsert(hotel)


async def _print_search_results(results) -> None:
    """Print id, city, country, description and score for every hit in ``results.results``."""
    async for result in results.results:
        record = result.record
        print(
            f" {record.hotel_id} (in {record.address['city']}, "
            f"{record.address['country']}): {record.description} (score: {result.score})"
        )


async def main(query: str, first_run: bool = False):
    """Run a text search and a vector search for ``query`` against the hotels sample index.

    Args:
        query: The text to search for; it is also embedded to drive the vector search.
        first_run: When True, vectors are added to the existing records before searching.

    Raises:
        ValueError: If the ``hotels-sample-index`` collection does not exist.
    """
    # Create the kernel
    kernel = Kernel()
    # Add the OpenAI text embedding service
    embeddings = OpenAITextEmbedding(service_id="embedding", ai_model_id="text-embedding-3-small")
    kernel.add_service(embeddings)
    # Create the VectorStoreRecordUtils object
    vectorizer = VectorStoreRecordUtils(kernel)
    # Create the Azure AI Search collection
    collection = AzureAISearchCollection[HotelSampleClass](
        collection_name="hotels-sample-index", data_model_type=HotelSampleClass
    )
    # Check if the collection exists.
    if not await collection.does_collection_exist():
        # Each fragment ends with a space so the concatenated sentences stay separated.
        raise ValueError(
            "Collection does not exist, please create using the "
            "Azure AI Search portal wizard -> Import Data -> Samples -> hotels-sample. "
            "During creation adapt the schema to add the description_vector and description_fr_vector fields. "
            "Then run this sample with `first_run=True` to add the vectors."
        )

    # If it is the first run and there are no vectors, add them.
    if first_run:
        await add_vectors(collection, vectorizer)

    # Search using just text, by default this will search all the searchable text fields in the index.
    results = await collection.text_search(search_text=query)
    print("Search results using text: ")
    await _print_search_results(results)

    print("\n")

    # Generate the vector for the query
    query_vector = (await embeddings.generate_raw_embeddings([query]))[0]

    print("Search results using vector: ")
    # Use vectorized search to search using the vector.
    results = await collection.vectorized_search(
        vector=query_vector,
        options=VectorSearchOptions(vector_field_name="description_vector"),
    )
    await _print_search_results(results)

    # Delete the collection object so that the connection is closed.
    del collection
    # Short pause before the event loop shuts down.
    await asyncio.sleep(2)


if __name__ == "__main__":
    # Script entry point: run the sample once with a fixed query and the module-level first_run flag.
    asyncio.run(main(query="swimming pool and good internet connection", first_run=first_run))
Loading
Loading