Merge branch 'main' into flightsIT
penghuo committed Feb 13, 2025
2 parents 35e2b51 + 4783f08 commit 5ad650e
Showing 933 changed files with 22,375 additions and 171 deletions.
6 changes: 6 additions & 0 deletions docker/integ-test/docker-compose.yml
@@ -44,6 +44,9 @@ services:
- type: bind
source: ../../$FLINT_JAR
target: /opt/bitnami/spark/jars/flint-spark-integration.jar
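      # SQL_APP_JAR is substituted by docker compose from its environment (e.g. a .env file next to docker-compose.yml)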
- type: bind
source: ../../$SQL_APP_JAR
target: /opt/bitnami/spark/jars/opensearch-spark-sql-application.jar
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8080/"]
interval: 1m
@@ -87,6 +90,9 @@ services:
- type: bind
source: ../../$FLINT_JAR
target: /opt/bitnami/spark/jars/flint-spark-integration.jar
- type: bind
source: ../../$SQL_APP_JAR
target: /opt/bitnami/spark/jars/opensearch-spark-sql-application.jar
networks:
- opensearch-net
depends_on:
1 change: 1 addition & 0 deletions docker/integ-test/spark-submit/Dockerfile
@@ -12,6 +12,7 @@ ENV SQL_APP_JAR $SQL_APP_JAR
COPY docker/integ-test/spark/spark-defaults.conf /opt/bitnami/spark/conf/spark-defaults.conf
COPY ${FLINT_JAR} /opt/bitnami/spark/jars/flint-spark-integration.jar
COPY ${PPL_JAR} /opt/bitnami/spark/jars/ppl-spark-integration.jar
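# Bundle the SQL application jar alongside the Flint and PPL integration jars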
COPY ${SQL_APP_JAR} /opt/bitnami/spark/jars/opensearch-spark-sql-application.jar

USER root
RUN mkdir /app
80 changes: 80 additions & 0 deletions docs/docker/integ-test/README.md
@@ -169,6 +169,86 @@ Sample response:
}
```

## Adding Tests

### Spark Tests

| | Query File Extension | Directory |
|-----|----------------------|-----------------------------------------------|
| PPL | .ppl | e2e-test/src/test/resources/spark/queries/ppl |
| SQL | .sql | e2e-test/src/test/resources/spark/queries/sql |

A test consists of a query and the expected results. Choose a base filename. The query file uses the base filename
with the extension above; the results file uses the base filename with a `.results` extension. The results file is
in CSV format with a header line of field names.

Example: Adding a PPL Test

1. Create a file named `e2e-test/src/test/resources/spark/queries/ppl/sample-test.ppl` with the following contents:
```
source=my-table | fields x, y
```
2. Generate the expected results file named `e2e-test/src/test/resources/spark/queries/ppl/sample-test.results`.
   It is a CSV file with a field-name header line. For example:
```
x,y
1,1
2,4
3,9
```
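
If the table is already loaded into Spark, the expected-results CSV can also be generated rather than typed by hand. A minimal sketch, assuming a running `spark-shell` session; the table name `my-table` and the output path are illustrative, not part of the project's tooling:
```scala
// A minimal sketch: producing an expected-results CSV with Spark itself.
// Assumes an active SparkSession (`spark`) and that `my-table` is queryable;
// the table name and output directory are illustrative assumptions.
val df = spark.sql("SELECT x, y FROM `my-table`")
df.coalesce(1)                      // one part file, matching the single .results file
  .write
  .option("header", "true")         // emit the field-name header line
  .mode("overwrite")
  .csv("/tmp/sample-test-results")  // copy the part-*.csv file to sample-test.results
```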

### Async API Tests

| | Query File Extension | Directory |
|-----|----------------------|----------------------------------------------------|
| PPL | .ppl | e2e-test/src/test/resources/opensearch/queries/ppl |
| SQL | .sql | e2e-test/src/test/resources/opensearch/queries/sql |

A test consists of a query and the expected results. Choose a base filename. The query file uses the base filename
with the extension above; the results file uses the base filename with a `.results` extension. The results file is
in JSON format and mirrors the response returned when retrieving the results using the Async Query API.

Example: Adding a PPL Test

1. Create a file named `e2e-test/src/test/resources/opensearch/queries/ppl/sample-test.ppl` with the following
contents:
```
source=my-table | fields x, y
```
2. Generate the expected results file named `e2e-test/src/test/resources/opensearch/queries/ppl/sample-test.results`.
   It is a JSON file. For example:
```
{
"status": "SUCCESS",
"schema": [
{
"name": "x",
"type": "integer"
},
{
"name": "y",
"type": "integer"
}
],
"datarows": [
[
1,
1
],
[
2,
4
],
[
3,
9
]
],
"total": 3,
"size": 3
}
```
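
The expected JSON matches the shape of the Async Query API response. Below is a hedged sketch of fetching such a response with the JDK HTTP client; the endpoint path `/_plugins/_async_query/{queryId}`, host, port, and query id are assumptions for illustration, not taken from this commit:
```scala
import java.net.URI
import java.net.http.{HttpClient, HttpRequest, HttpResponse}

object FetchAsyncResult {
  def main(args: Array[String]): Unit = {
    // Hypothetical query id returned when the async query was submitted.
    val queryId = "sample-query-id"
    val request = HttpRequest.newBuilder()
      .uri(URI.create(s"http://localhost:9200/_plugins/_async_query/$queryId"))
      .GET()
      .build()
    val response = HttpClient.newHttpClient()
      .send(request, HttpResponse.BodyHandlers.ofString())
    // The body should have the same shape as the .results file above.
    println(response.body())
  }
}
```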

## Configuration of the Cluster

There are several settings that can be adjusted for the cluster.
2 changes: 2 additions & 0 deletions docs/index.md
@@ -543,6 +543,8 @@ In the index mapping, the `_meta` and `properties` field stores meta and schema i
- `spark.datasource.flint.read.scroll_size`: default value is 100.
- `spark.datasource.flint.read.scroll_duration`: default value is 5 minutes. scroll context keep-alive duration.
- `spark.datasource.flint.retry.max_retries`: max retries on failed HTTP request. default value is 3. Use 0 to disable retry.
- `spark.datasource.flint.retry.bulk.max_retries`: max retries on failed bulk request. default value is 10. Use 0 to disable retry.
- `spark.datasource.flint.retry.bulk.initial_backoff`: initial backoff in seconds for bulk request retry, default is 4 (see the configuration sketch after this list).
- `spark.datasource.flint.retry.http_status_codes`: retryable HTTP response status code list. default value is "429,502" (429 Too Many Requests and 502 Bad Gateway).
- `spark.datasource.flint.retry.exception_class_names`: retryable exception class name list. by default no retry on any exception thrown.
- `spark.datasource.flint.read.support_shard`: default is true. set to false if the index does not support shard (AWS OpenSearch Serverless collection). Do not use in production; this setting will be removed in a later version.
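
As a usage illustration, these retry options are set like any other Spark configuration. A minimal sketch in Scala, assuming a SparkSession is being built; the values shown are simply the documented defaults, written out explicitly:
```scala
import org.apache.spark.sql.SparkSession

// A minimal sketch: setting the Flint retry options from the list above.
// The values are the documented defaults; the app name is illustrative.
val spark = SparkSession.builder()
  .appName("flint-retry-example")
  .config("spark.datasource.flint.retry.max_retries", "3")
  .config("spark.datasource.flint.retry.bulk.max_retries", "10")
  .config("spark.datasource.flint.retry.bulk.initial_backoff", "4")
  .config("spark.datasource.flint.retry.http_status_codes", "429,502")
  .getOrCreate()
```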
25 changes: 13 additions & 12 deletions docs/opensearch-table.md
@@ -47,6 +47,12 @@ Using a wildcard index name:
val df = spark.sql("SELECT * FROM dev.default.`my_index*`")
df.show()
```
Join two indices:
```scala
val df = spark.sql("SELECT * FROM dev.default.my_index as t1 JOIN dev.default.my_index as t2 ON t1.id == t2.id")
df.show()
```

## Data Types
The following table defines the data type mapping between OpenSearch index field type and Spark data type.

@@ -70,20 +76,15 @@ The following table defines the data type mapping between OpenSearch index field
* Map to DateType if format = strict_date (we also support format = date; may change in future)
* Map to TimestampType if format = strict_date_optional_time_nanos (we also support format =
  strict_date_optional_time | epoch_millis; may change in future)
* Spark data types VarcharType(length) and CharType(length) are both currently mapped to OpenSearch
  data type *keyword*, dropping their length property. On the other hand, OpenSearch data type
  *keyword* only maps to StringType.
* Spark data type MapType is mapped to an empty OpenSearch object. The inner fields then rely on
  dynamic mapping. On the other hand, OpenSearch data type *object* only maps to StructType.
* Spark data type DecimalType is mapped to an OpenSearch double. On the other hand, OpenSearch data
  type *double* only maps to DoubleType.
* OpenSearch alias fields allow alternative names for existing fields in the schema without duplicating data. They inherit the data type and nullability of the referenced field and resolve dynamically to the primary field in queries.

* OpenSearch multi-fields on text fields are supported. These multi-fields are stored as part of the field's metadata and cannot be selected directly. Instead, they are used automatically during DSL query translation.

## Limitation
### catalog operation
6 changes: 3 additions & 3 deletions docs/ppl-lang/ppl-lookup-command.md
@@ -29,17 +29,17 @@ LOOKUP <lookupIndex> (<lookupMappingField> [AS <sourceMappingField>])...
**inputField**
- Optional
- Default: All fields of \<lookupIndex\> are applied to the result output if no field is specified.
- Description: A field in \<lookupIndex\> whose matched values are applied to the result output. You can specify multiple \<inputField\> values, comma-delimited. If you don't specify any \<inputField\>, all fields except \<lookupMappingField\> from \<lookupIndex\> are applied to the result output.

**outputField**
- Optional
- Default: \<inputField\>
- Description: A field of the output. You can specify zero or multiple \<outputField\>. If you specify an \<outputField\> with an existing field name in the source query, its values will be replaced or appended by matched values from \<inputField\>. If the field specified in \<outputField\> is a new field, the REPLACE strategy adds it to the results as an extended new field, while the APPEND strategy fails.

**REPLACE | APPEND**
- Optional
- Default: REPLACE
- Description: The output strategy. If you specify REPLACE, matched values in the \<lookupIndex\> field overwrite the values in the result. If you specify APPEND, matched values in the \<lookupIndex\> field are only appended to the missing values in the result.

### Usage
- `LOOKUP <lookupIndex> id AS cid REPLACE mail AS email`
@@ -0,0 +1 @@
source = mys3.default.http_logs | dedup 1 status | fields @timestamp, clientip, status, size | head 10
@@ -0,0 +1,37 @@
{
"status": "SUCCESS",
"schema": [
{
"name": "@timestamp",
"type": "timestamp_ntz"
},
{
"name": "clientip",
"type": "string"
},
{
"name": "status",
"type": "long"
},
{
"name": "size",
"type": "long"
}
],
"datarows": [
[
"2023-10-01T10:00:00.000",
"40.135.0.0",
200,
24736
],
[
"2023-10-01T10:20:00.000",
"247.37.0.0",
304,
0
]
],
"total": 2,
"size": 2
}
@@ -0,0 +1 @@
source = mys3.default.http_logs | dedup status, size | head 10
@@ -0,0 +1,71 @@
{
"status": "SUCCESS",
"schema": [
{
"name": "@timestamp",
"type": "timestamp_ntz"
},
{
"name": "year",
"type": "long"
},
{
"name": "month",
"type": "long"
},
{
"name": "day",
"type": "long"
},
{
"name": "clientip",
"type": "string"
},
{
"name": "request",
"type": "string"
},
{
"name": "status",
"type": "long"
},
{
"name": "size",
"type": "long"
}
],
"datarows": [
[
"2023-10-01T10:15:00.000",
2023,
10,
1,
"247.37.0.0",
"GET /french/splash_inet.html HTTP/1.0",
200,
3781
],
[
"2023-10-01T10:00:00.000",
2023,
10,
1,
"40.135.0.0",
"GET /images/hm_bg.jpg HTTP/1.0",
200,
24736
],
[
"2023-10-01T10:20:00.000",
2023,
10,
1,
"247.37.0.0",
"GET /images/hm_nbg.jpg HTTP/1.0",
304,
0
]
],
"total": 3,
"size": 3
}
@@ -0,0 +1 @@
source = mys3.default.http_logs | dedup 1 status keepempty=true | head 10
@@ -0,0 +1,61 @@
{
"status": "SUCCESS",
"schema": [
{
"name": "@timestamp",
"type": "timestamp_ntz"
},
{
"name": "year",
"type": "long"
},
{
"name": "month",
"type": "long"
},
{
"name": "day",
"type": "long"
},
{
"name": "clientip",
"type": "string"
},
{
"name": "request",
"type": "string"
},
{
"name": "status",
"type": "long"
},
{
"name": "size",
"type": "long"
}
],
"datarows": [
[
"2023-10-01T10:00:00.000",
2023,
10,
1,
"40.135.0.0",
"GET /images/hm_bg.jpg HTTP/1.0",
200,
24736
],
[
"2023-10-01T10:20:00.000",
2023,
10,
1,
"247.37.0.0",
"GET /images/hm_nbg.jpg HTTP/1.0",
304,
0
]
],
"total": 2,
"size": 2
}
@@ -0,0 +1 @@
source = mys3.default.http_logs | dedup status, size keepempty=true | head 10
