-
Notifications
You must be signed in to change notification settings - Fork 36
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Support multi-fields for text fields and improve push down logic
Signed-off-by: Peng Huo <[email protected]>
- Loading branch information
Showing
10 changed files
with
385 additions
and
22 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
88 changes: 88 additions & 0 deletions
88
...-integration/src/main/scala/org/apache/spark/sql/flint/datatype/FlintMetadataHelper.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
/* | ||
* Copyright OpenSearch Contributors | ||
* SPDX-License-Identifier: Apache-2.0 | ||
*/ | ||
|
||
package org.apache.spark.sql.flint.datatype | ||
|
||
import org.apache.spark.sql.types.{MetadataBuilder, _} | ||
|
||
/**
 * Utilities for reading and writing the OpenSearch-related entries kept in Spark field
 * [[Metadata]]: the `osType` marker for text fields and the nested `fields` entry describing
 * multi-fields.
 */
object FlintMetadataHelper {
  // OpenSearch Mappings. https://opensearch.org/docs/latest/field-types/
  val OS_TYPE_KEY: String = "osType"
  val FIELDS_NAMES_KEY: String = "fields"

  // OpenSearch field types. https://opensearch.org/docs/latest/field-types/supported-field-types/index/
  val TEXT_TYPE: String = "text"
  val KEYWORD_TYPE: String = "keyword"

  /**
   * Tell whether the metadata marks a field as an OpenSearch text field.
   *
   * @param metadata
   *   the field metadata to inspect.
   * @return
   *   true only when an OS_TYPE_KEY entry is present and its string value equals TEXT_TYPE.
   */
  def isTextField(metadata: Metadata): Boolean =
    // The presence check comes first so getString is never called for an absent key.
    metadata.contains(OS_TYPE_KEY) && metadata.getString(OS_TYPE_KEY) == TEXT_TYPE

  /**
   * Mark the field being built as an OpenSearch text field.
   *
   * @param builder
   *   the MetadataBuilder to update.
   * @return
   *   the result of putting TEXT_TYPE under OS_TYPE_KEY on the builder.
   */
  def addTextFieldMetadata(builder: MetadataBuilder): MetadataBuilder =
    builder.putString(OS_TYPE_KEY, TEXT_TYPE)

  /**
   * Record multi-field (subfield) information on the provided MetadataBuilder.
   *
   * The subfields are bucketed by field type. Every bucket becomes one string-array entry of a
   * nested metadata object, keyed by the field type and holding the names of the subfields of
   * that type. The nested object is then put on the builder under FIELDS_NAMES_KEY; this happens
   * even when `fields` is empty, in which case the nested object has no entries.
   *
   * @param builder
   *   the MetadataBuilder to update with multi-field metadata.
   * @param fields
   *   a map where each key is a field name and the corresponding value is its field type.
   * @return
   *   the updated MetadataBuilder containing the multi-field metadata.
   */
  def addMultiFieldMetadata(
      builder: MetadataBuilder,
      fields: Map[String, String]): MetadataBuilder = {
    val fieldsByType = fields.groupBy { case (_, fieldType) => fieldType }
    val nested = fieldsByType.foldLeft(new MetadataBuilder()) {
      case (acc, (fieldType, members)) =>
        // The keys of each bucket are the names of the subfields sharing this field type.
        acc.putStringArray(fieldType, members.keys.toArray)
    }
    builder.putMetadata(FIELDS_NAMES_KEY, nested.build())
  }

  /**
   * Look up the first subfield name registered with type KEYWORD_TYPE.
   *
   * The lookup goes through the nested metadata stored under FIELDS_NAMES_KEY and, inside it, the
   * string array stored under KEYWORD_TYPE. A missing entry at either level, or an empty array,
   * yields None.
   *
   * @param metadata
   *   the metadata from which to retrieve the keyword subfield.
   * @return
   *   an Option containing the first keyword subfield name, if found; otherwise, None.
   */
  def getKeywordSubfield(metadata: Metadata): Option[String] =
    Some(metadata)
      .filter(_.contains(FIELDS_NAMES_KEY))
      .map(_.getMetadata(FIELDS_NAMES_KEY))
      .filter(_.contains(KEYWORD_TYPE))
      .flatMap(_.getStringArray(KEYWORD_TYPE).headOption)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
86 changes: 86 additions & 0 deletions
86
...gration/src/test/scala/org/apache/spark/sql/flint/datatype/FlintMetadataHelperSuite.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
/* | ||
* Copyright OpenSearch Contributors | ||
* SPDX-License-Identifier: Apache-2.0 | ||
*/ | ||
|
||
package org.apache.spark.sql.flint.datatype | ||
|
||
import org.scalatest.matchers.should.Matchers | ||
|
||
import org.apache.spark.FlintSuite | ||
import org.apache.spark.sql.types._ | ||
|
||
/**
 * Unit tests for [[FlintMetadataHelper]]: text-field detection and tagging, grouping of
 * multi-fields by type, and lookup of the first keyword subfield.
 */
class FlintMetadataHelperSuite extends FlintSuite with Matchers {

  /** Build metadata that carries only an osType entry with the given value. */
  private def metadataWithOsType(osType: String): Metadata =
    new MetadataBuilder().putString(FlintMetadataHelper.OS_TYPE_KEY, osType).build()

  /** Build metadata whose multi-field entry is derived from `fields` (name -> type). */
  private def metadataWithMultiFields(fields: Map[String, String]): Metadata =
    FlintMetadataHelper.addMultiFieldMetadata(new MetadataBuilder(), fields).build()

  test("isTextField returns true when osType is text") {
    val metadata = metadataWithOsType(FlintMetadataHelper.TEXT_TYPE)
    FlintMetadataHelper.isTextField(metadata) shouldBe true
  }

  test("isTextField returns false when osType is not text") {
    val metadata = metadataWithOsType("non-text")
    FlintMetadataHelper.isTextField(metadata) shouldBe false
  }

  test("addTextFieldMetadata sets osType to text") {
    val metadata = FlintMetadataHelper.addTextFieldMetadata(new MetadataBuilder()).build()
    metadata.getString(FlintMetadataHelper.OS_TYPE_KEY) shouldBe FlintMetadataHelper.TEXT_TYPE
  }

  test("addMultiFieldMetadata groups fields by field type") {
    val metadata = metadataWithMultiFields(
      Map(
        "field1" -> FlintMetadataHelper.TEXT_TYPE,
        "field2" -> FlintMetadataHelper.KEYWORD_TYPE,
        "field3" -> FlintMetadataHelper.KEYWORD_TYPE))

    // The grouped names must live in a nested metadata object under FIELDS_NAMES_KEY.
    metadata.contains(FlintMetadataHelper.FIELDS_NAMES_KEY) shouldBe true
    val nested = metadata.getMetadata(FlintMetadataHelper.FIELDS_NAMES_KEY)

    // Exactly one subfield was declared with the text type.
    nested.contains(FlintMetadataHelper.TEXT_TYPE) shouldBe true
    nested.getStringArray(FlintMetadataHelper.TEXT_TYPE).toSeq shouldBe Seq("field1")

    // Two subfields were declared with the keyword type; their relative order is not
    // guaranteed by the grouping, so sort before comparing.
    nested.contains(FlintMetadataHelper.KEYWORD_TYPE) shouldBe true
    val keywordNames = nested.getStringArray(FlintMetadataHelper.KEYWORD_TYPE)
    keywordNames.sorted.toSeq shouldBe Seq("field2", "field3")
  }

  test("getKeywordSubfield returns the first keyword field if available") {
    val metadata = metadataWithMultiFields(
      Map(
        "field1" -> FlintMetadataHelper.TEXT_TYPE,
        "field2" -> FlintMetadataHelper.KEYWORD_TYPE,
        "field3" -> FlintMetadataHelper.KEYWORD_TYPE))

    // Read the stored keyword names directly so the expectation follows whatever order the
    // helper persisted them in.
    val storedKeywordNames = metadata
      .getMetadata(FlintMetadataHelper.FIELDS_NAMES_KEY)
      .getStringArray(FlintMetadataHelper.KEYWORD_TYPE)

    FlintMetadataHelper.getKeywordSubfield(metadata) shouldBe storedKeywordNames.headOption
  }

  test("getKeywordSubfield returns None if no keyword field exists") {
    val metadata = metadataWithMultiFields(Map("field1" -> FlintMetadataHelper.TEXT_TYPE))

    // Only a text subfield was registered, so there is no keyword entry to return.
    FlintMetadataHelper.getKeywordSubfield(metadata) shouldBe None
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.