Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Embedding Model works, But CustomVectorizer returns HttpResponseError #300

Open
wally-jhsong opened this issue Feb 5, 2025 · 1 comment

Comments

@wally-jhsong
Copy link

When I create an Azure AI Search Index using CustomVectorizer and then perform a query, an HttpResponseError occurs.

The CustomVectorizer calls my custom E5 embedding model, which is deployed as an endpoint in Azure Machine Learning.

The logs of the Azure Machine Learning endpoint show that the request returns 200 OK.

However, performing a search returns the following error:

HttpResponseError: () Could not vectorize the query because the vectorization endpoint response is invalid.
Code: 
Message: Could not vectorize the query because the vectorization endpoint response is invalid.
File <command-8161632147394724>, line 5
      1 from itertools import tee
      3 results, results_backup = tee(results)
----> 5 for i, r in enumerate(results):
      6     print(r)
      8 results_backup
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-a4d14cee-3610-4575-b83e-16f37fffb48a/lib/python3.12/site-packages/azure/search/documents/_generated/operations/_documents_operations.py:778, in DocumentsOperations.search_post(self, search_request, request_options, **kwargs)
    776     map_error(status_code=response.status_code, response=response, error_map=error_map)
    777     error = self._deserialize.failsafe_deserialize(_models.ErrorResponse, pipeline_response)
--> 778     raise HttpResponseError(response=response, model=error)
    780 deserialized = self._deserialize("SearchDocumentsResult", pipeline_response)
    782 if cls:

https://github.com/Azure/azure-search-vector-samples/blob/main/demo-python/code/custom-vectorizer/scripts/setup_search_service.py
The code I wrote and ran, based on the setup_search_service.py file, is as follows:

from azure.search.documents import SearchClient
import json
from azure.search.documents.indexes import SearchIndexClient
from azure.core.pipeline.policies import HTTPPolicy
from azure.search.documents.indexes.models import (
    SimpleField,
    SearchFieldDataType,
    SearchableField,
    SearchField,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    SemanticConfiguration,
    SemanticSearch,
    SemanticField,
    SemanticPrioritizedFields,
    SearchIndex,
    CustomVectorizer,
    CustomWebApiParameters
)

# Workaround required to use the preview SDK
class CustomVectorizerRewritePolicy(HTTPPolicy):
    """Pipeline policy that renames one property in outgoing request bodies.

    Every occurrence of ``customVectorizerParameters`` in the request body is
    replaced with ``customWebApiParameters`` before the request is handed to
    the next policy in the pipeline.
    """

    def send(self, request):
        """Rewrite the request body, if there is one, and forward the request.

        Args:
            request: Pipeline request whose ``http_request.body`` is rewritten.

        Returns:
            Whatever the next policy in the pipeline returns.
        """
        body = request.http_request.body
        # The policy is attached to every call of the client, so the body is
        # not guaranteed to be a str: calling str.replace on None or on bytes
        # would raise. Only rewrite payloads that can actually be rewritten.
        if isinstance(body, str):
            request.http_request.body = body.replace('customVectorizerParameters', 'customWebApiParameters')
        elif isinstance(body, (bytes, bytearray)):
            request.http_request.body = body.replace(b'customVectorizerParameters', b'customWebApiParameters')
        return self.next.send(request)

# Create a search index
# https://learn.microsoft.com/en-us/python/api/azure-search-documents/azure.search.documents.indexes.searchindexclient?view=azure-python
# The rewrite policy is attached per call, so every request sent through this
# client has its body passed through CustomVectorizerRewritePolicy.
index_client = SearchIndexClient(
    endpoint=endpoint,
    credential=credential,
    per_call_policies=[CustomVectorizerRewritePolicy()],
)

# One key field, three text fields, and two vector fields that share the same
# dimensions and vector search profile.
# NOTE(review): 1536 presumably has to match the length of the vectors the
# embedding endpoint returns - confirm for the E5 model in use.
fields = [
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        sortable=True,
        filterable=True,
        facetable=True,
    ),
    SearchableField(name="title", type=SearchFieldDataType.String),
    SearchableField(name="content", type=SearchFieldDataType.String),
    SearchableField(name="category", type=SearchFieldDataType.String, filterable=True),
    *[
        SearchField(
            name=vector_field_name,
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True,
            vector_search_dimensions=1536,
            vector_search_profile_name="myHnswProfile",
        )
        for vector_field_name in ("titleVector", "contentVector")
    ],
]

# Sent with every call the search service makes to the embedding endpoint.
headers = {'Authorization': 'Bearer ' + aml_endpoint_key}

# Configure the vector search configuration
# HNSW: Hierarchical Navigable Small World algorithm
vector_search = VectorSearch(
    algorithms=[HnswAlgorithmConfiguration(name="myHnsw")],
    profiles=[
        VectorSearchProfile(
            name="myHnswProfile",
            algorithm_configuration_name="myHnsw",
            # NOTE(review): other SDK releases appear to name this argument
            # `vectorizer_name` - confirm against the installed version.
            vectorizer="customVectorizer",
        ),
    ],
    vectorizers=[
        CustomVectorizer(
            name="customVectorizer",
            custom_web_api_parameters=CustomWebApiParameters(
                uri=aml_endpoint_url,
                http_headers=headers,
                http_method="POST",
            ),
        ),
    ],
)


# Create (or update) the index. Only the fields and the vector search
# configuration are passed; no semantic configuration is set here.
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
)
result = index_client.create_or_update_index(index)
print(f'{result.name} created')


# Upload some documents to the index
import os  # used for the path below; not imported anywhere else in the visible script

output_path = os.path.join("../data", "docVectors-kse-2.json")
# Read the JSON file as UTF-8 explicitly instead of relying on the platform's
# default encoding.
with open(output_path, 'r', encoding="utf-8") as file:
    documents = json.load(file)
search_client = SearchClient(endpoint=endpoint, index_name=index_name, credential=credential)
result = search_client.upload_documents(documents)
# upload_documents reports success per document, so check each result rather
# than announcing success for the whole batch unconditionally.
failed_keys = [item.key for item in result if not item.succeeded]
if failed_keys:
    print(f"Failed to upload {len(failed_keys)} documents: {failed_keys}")
print(f"Uploaded {len(documents) - len(failed_keys)} documents")


# Perform a text similarity search
from azure.search.documents.models import VectorizableTextQuery

query = "tools for software development"

search_client = SearchClient(endpoint, index_name, credential=credential)
# The query text is vectorized by the service, using the vectorizer attached to
# the profile of the "contentVector" field.
vector_query = VectorizableTextQuery(text=query, k_nearest_neighbors=2, fields="contentVector")

results = search_client.search(
    search_text=None,
    vector_queries=[vector_query],
    select=["title", "content", "category"],
    top=1,
    include_total_count=True,
)

# The request is only sent once the results are iterated, so the
# HttpResponseError reported above surfaces on the next line.
# The results are printed twice, so they are materialized once into a list;
# itertools.tee would buffer every item anyway, because the first loop finishes
# before the second one starts.
results_backup = list(results)

print("**Print All**")
for r in results_backup:
    print(r)


print("**Print Key Result**")
for result in results_backup:
    print(f"Title: {result['title']}")
    print(f"Score: {result['@search.score']}")
    print(f"Content: {result['content']}")
    print(f"Category: {result['category']}\n")

https://github.com/Azure-Samples/azure-search-power-skills/blob/main/Common/WebAPISkillContract.cs
The score.py (embedding model endpoint) that I wrote, based on the WebAPISkillContract.cs file, is as follows:

def run(raw_data):
    """Embed the text of each input record and return a Web API style response.

    Args:
        raw_data: JSON string of the form
            ``{"values": [{"recordId": ..., "data": {"text": ...}}, ...]}``.

    Returns:
        JSON string ``{"values": [...]}`` holding one entry per input record
        whose ``data`` contains ``"text"``. Each entry carries the record's
        ``recordId``, a ``data`` object with the embedding under ``"vector"``
        on success, ``errors`` (``None`` or a list of ``{"message": ...}``)
        and ``warnings`` (``None``). Records without ``"text"`` are omitted,
        as before.

    Raises:
        json.JSONDecodeError: If ``raw_data`` is not valid JSON.
        KeyError: If ``"values"``, ``"recordId"`` or ``"data"`` is missing.
    """
    logger.debug("raw_data: %s", raw_data)

    input_values = json.loads(raw_data)["values"]

    output = {"values": []}

    for input_value in input_values:
        record_data = input_value["data"]
        if "text" not in record_data:
            continue

        value_dic = {
            "recordId": input_value["recordId"],
            "data": {},
            "errors": None,
            "warnings": None,
        }

        try:
            text_ndarr = model.encode(
                EMBEDDING_FORMAT.format(prefix="UNUSED0002", text=record_data["text"])
            )
            # The custom vectorizer reads the embedding from data.vector; any
            # other key name makes the response invalid for the search service.
            value_dic["data"]["vector"] = text_ndarr.tolist()
        except Exception as e:
            # Covers Warning as well (it subclasses Exception): when encode
            # raises, no embedding exists for this record, so it is reported as
            # an error instead of reusing a vector from an earlier record.
            logger.debug("fail to encode: %s", e)
            # Errors are reported as a list of message objects.
            value_dic["errors"] = [{"message": str(e)}]

        output["values"].append(value_dic)

    logger.debug("output: %s", output)

    return json.dumps(output)
@wally-jhsong
Copy link
Author

I tried changing the 'hitPositions' key to 'vector' because a 'vector' item must exist in the data object, but the same error occurs.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant