diff --git a/docker/integ-test/.env b/docker/integ-test/.env
index cf73bdc89..7d8995956 100644
--- a/docker/integ-test/.env
+++ b/docker/integ-test/.env
@@ -5,9 +5,13 @@ MASTER_UI_PORT=8080
MASTER_PORT=7077
UI_PORT=4040
SPARK_CONNECT_PORT=15002
-PPL_JAR=../../ppl-spark-integration/target/scala-2.12/ppl-spark-integration-assembly-0.7.0-SNAPSHOT.jar
-FLINT_JAR=../../flint-spark-integration/target/scala-2.12/flint-spark-integration-assembly-0.7.0-SNAPSHOT.jar
+PPL_JAR=./ppl-spark-integration/target/scala-2.12/ppl-spark-integration-assembly-0.7.0-SNAPSHOT.jar
+FLINT_JAR=./flint-spark-integration/target/scala-2.12/flint-spark-integration-assembly-0.7.0-SNAPSHOT.jar
+SQL_APP_JAR=./spark-sql-application/target/scala-2.12/sql-job-assembly-0.7.0-SNAPSHOT.jar
OPENSEARCH_NODE_MEMORY=512m
OPENSEARCH_ADMIN_PASSWORD=C0rrecthorsebatterystaple.
OPENSEARCH_PORT=9200
+OPENSEARCH_PA_PORT=9600
OPENSEARCH_DASHBOARDS_PORT=5601
+S3_ACCESS_KEY=Vt7jnvi5BICr1rkfsheT
+S3_SECRET_KEY=5NK3StGvoGCLUWvbaGN0LBUf9N6sjE94PEzLdqwO
diff --git a/docker/integ-test/configuration-updater/apply-configuration.sh b/docker/integ-test/configuration-updater/apply-configuration.sh
new file mode 100644
index 000000000..7946c75cd
--- /dev/null
+++ b/docker/integ-test/configuration-updater/apply-configuration.sh
@@ -0,0 +1,73 @@
+#!/bin/sh
+
+# Copyright OpenSearch Contributors
+# SPDX-License-Identifier: Apache-2.0
+
+# Login to Minio
+curl -q \
+ -c /tmp/minio-cookies.txt \
+ -H 'Content-Type: application/json' \
+ -d '{"accessKey": "minioadmin", "secretKey": "minioadmin"}' \
+ http://minio-S3:9001/api/v1/login
+# Delete the test bucket
+curl -b /tmp/minio-cookies.txt \
+ -X DELETE \
+ http://minio-S3:9001/api/v1/buckets/test
+# Create the integ-test bucket
+curl -q \
+ -b /tmp/minio-cookies.txt \
+ -X POST \
+ -H 'Content-Type: application/json' \
+ -d '{"name": "integ-test", "versioning": {"enabled": true, "excludePrefixes": [], "excludeFolders": false}, "locking": true}' \
+ http://minio-S3:9001/api/v1/buckets
+# Create the access key
+curl -q \
+ -b /tmp/minio-cookies.txt \
+ -X POST \
+ -H 'Content-Type: application/json' \
+ -d "{\"policy\": \"\", \"accessKey\": \"${S3_ACCESS_KEY}\", \"secretKey\": \"${S3_SECRET_KEY}\", \"description\": \"\", \"comment\": \"\", \"name\": \"\", \"expiry\": null}" \
+ http://minio-S3:9001/api/v1/service-account-credentials
+
+# Login to OpenSearch Dashboards
+echo ">>> Login to OpenSearch dashboards"
+curl -q \
+ -c /tmp/opensearch-cookies.txt \
+ -X POST \
+ -H 'Content-Type: application/json' \
+ -H 'Osd-Version: 2.18.0' \
+ -H 'Osd-Xsrf: fetch' \
+ -d "{\"username\": \"admin\", \"password\": \"${OPENSEARCH_ADMIN_PASSWORD}\"}" \
+ 'http://opensearch-dashboards:5601/auth/login?dataSourceId='
+if [ "$?" -eq "0" ]; then
+ echo " >>> Login successful"
+else
+ echo " >>> Login failed"
+fi
+# Create the S3/Glue datasource
+echo ">>> Creating datasource"
+curl -q \
+ -b /tmp/opensearch-cookies.txt \
+ -X POST \
+ -H 'Content-Type: application/json' \
+ -H 'Osd-Version: 2.18.0' \
+ -H 'Osd-Xsrf: fetch' \
+ -d "{\"name\": \"mys3\", \"allowedRoles\": [], \"connector\": \"s3glue\", \"properties\": {\"glue.auth.type\": \"iam_role\", \"glue.auth.role_arn\": \"arn:aws:iam::123456789012:role/S3Access\", \"glue.indexstore.opensearch.uri\": \"http://opensearch:9200\", \"glue.indexstore.opensearch.auth\": \"basicauth\", \"glue.indexstore.opensearch.auth.username\": \"admin\", \"glue.indexstore.opensearch.auth.password\": \"${OPENSEARCH_ADMIN_PASSWORD}\"}}" \
+ http://opensearch-dashboards:5601/api/directquery/dataconnections
+if [ "$?" -eq "0" ]; then
+ echo " >>> S3 datasource created"
+else
+ echo " >>> Failed to create S3 datasource"
+fi
+
+echo ">>> Setting cluster settings"
+curl -v \
+ -u "admin:${OPENSEARCH_ADMIN_PASSWORD}" \
+ -X PUT \
+ -H 'Content-Type: application/json' \
+ -d '{"persistent": {"plugins.query.executionengine.spark.config": "{\"applicationId\":\"integ-test\",\"executionRoleARN\":\"arn:aws:iam::xxxxx:role/emr-job-execution-role\",\"region\":\"us-west-2\", \"sparkSubmitParameters\": \"--conf spark.dynamicAllocation.enabled=false\"}"}}' \
+ http://opensearch:9200/_cluster/settings
+if [ "$?" -eq "0" ]; then
+ echo " >>> Successfully set cluster settings"
+else
+ echo " >>> Failed to set cluster settings"
+fi
diff --git a/docker/integ-test/docker-compose.yml b/docker/integ-test/docker-compose.yml
index c5ee53d7d..9fe79dc22 100644
--- a/docker/integ-test/docker-compose.yml
+++ b/docker/integ-test/docker-compose.yml
@@ -1,13 +1,35 @@
services:
+ metastore:
+ build: ./metastore
+ container_name: metastore
+ ports:
+ - "${THRIFT_PORT:-9083}:9083"
+ volumes:
+ - type: bind
+ source: ./metastore/hive-site.xml
+ target: /opt/apache-hive-2.3.9-bin/conf/hive-site.xml
+ - type: bind
+ source: ./metastore/hive-log4j2.properties
+ target: /opt/apache-hive-2.3.9-bin/conf/hive-log4j2.properties
+ - type: volume
+ source: metastore-data
+ target: /data
+ networks:
+ - opensearch-net
+
spark:
- image: bitnami/spark:${SPARK_VERSION:-3.5.3}
+ build:
+ context: ./spark
+ dockerfile: Dockerfile
+ args:
+ SPARK_VERSION: ${SPARK_VERSION:-3.5.3}
container_name: spark
+ entrypoint: /opt/bitnami/scripts/spark/spark-master-entrypoint.sh
ports:
- "${MASTER_UI_PORT:-8080}:8080"
- "${MASTER_PORT:-7077}:7077"
- "${UI_PORT:-4040}:4040"
- "${SPARK_CONNECT_PORT}:15002"
- entrypoint: /opt/bitnami/scripts/spark/master-entrypoint.sh
environment:
- SPARK_MODE=master
- SPARK_RPC_AUTHENTICATION_ENABLED=no
@@ -17,19 +39,10 @@ services:
- SPARK_PUBLIC_DNS=localhost
volumes:
- type: bind
- source: ./spark-master-entrypoint.sh
- target: /opt/bitnami/scripts/spark/master-entrypoint.sh
- - type: bind
- source: ./spark-defaults.conf
- target: /opt/bitnami/spark/conf/spark-defaults.conf
- - type: bind
- source: ./log4j2.properties
- target: /opt/bitnami/spark/conf/log4j2.properties
- - type: bind
- source: $PPL_JAR
+ source: ../../$PPL_JAR
target: /opt/bitnami/spark/jars/ppl-spark-integration.jar
- type: bind
- source: $FLINT_JAR
+ source: ../../$FLINT_JAR
target: /opt/bitnami/spark/jars/flint-spark-integration.jar
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8080/"]
@@ -40,9 +53,22 @@ services:
start_interval: 5s
networks:
- opensearch-net
+ depends_on:
+ metastore:
+ condition: service_started
+ opensearch:
+ condition: service_healthy
+ opensearch-dashboards:
+ condition: service_healthy
+ configuration-updater:
+ condition: service_completed_successfully
spark-worker:
- image: bitnami/spark:${SPARK_VERSION:-3.5.3}
+ build:
+ context: ./spark
+ dockerfile: Dockerfile
+ args:
+ SPARK_VERSION: ${SPARK_VERSION:-3.5.3}
container_name: spark-worker
environment:
- SPARK_MODE=worker
@@ -56,32 +82,43 @@ services:
- SPARK_PUBLIC_DNS=localhost
volumes:
- type: bind
- source: ./spark-defaults.conf
- target: /opt/bitnami/spark/conf/spark-defaults.conf
- - type: bind
- source: ./log4j2.properties
- target: /opt/bitnami/spark/conf/log4j2.properties
- - type: bind
- source: $PPL_JAR
+ source: ../../$PPL_JAR
target: /opt/bitnami/spark/jars/ppl-spark-integration.jar
- type: bind
- source: $FLINT_JAR
+ source: ../../$FLINT_JAR
target: /opt/bitnami/spark/jars/flint-spark-integration.jar
networks:
- opensearch-net
depends_on:
- - spark
+ metastore:
+ condition: service_started
+ spark:
+ condition: service_healthy
+
+ spark-submit:
+ build:
+ context: ../../
+ dockerfile: docker/integ-test/spark-submit/Dockerfile
+ args:
+ FLINT_JAR: ${FLINT_JAR}
+ PPL_JAR: ${PPL_JAR}
+ SQL_APP_JAR: ${SQL_APP_JAR}
+ depends_on:
+ metastore:
+ condition: service_completed_successfully
opensearch:
- image: opensearchproject/opensearch:${OPENSEARCH_VERSION:-latest}
+ build: ./opensearch
container_name: opensearch
environment:
- cluster.name=opensearch-cluster
- node.name=opensearch
- - discovery.seed_hosts=opensearch
- - cluster.initial_cluster_manager_nodes=opensearch
+ - discovery.type=single-node
- bootstrap.memory_lock=true
+ - plugins.security.system_indices.enabled=false
+ - plugins.security.system_indices.permission.enabled=false
- plugins.security.ssl.http.enabled=false
+ - plugins.query.datasources.encryption.masterkey=9a515c99d4313f140a6607053502f4d6
- OPENSEARCH_JAVA_OPTS=-Xms${OPENSEARCH_NODE_MEMORY:-512m} -Xmx${OPENSEARCH_NODE_MEMORY:-512m}
- OPENSEARCH_INITIAL_ADMIN_PASSWORD=${OPENSEARCH_ADMIN_PASSWORD}
ulimits:
@@ -92,12 +129,18 @@ services:
soft: 65536
hard: 65536
volumes:
- - opensearch-data:/usr/share/opensearch/data
+ - type: volume
+ source: opensearch-data
+ target: /usr/share/opensearch/data
+ - type: bind
+ source: /var/run/docker.sock
+ target: /var/run/docker.sock
ports:
- ${OPENSEARCH_PORT:-9200}:9200
- - 9600:9600
+ - ${OPENSEARCH_PA_PORT:-9600}:9600
expose:
- "${OPENSEARCH_PORT:-9200}"
+ - "9300"
healthcheck:
test: ["CMD", "curl", "-f", "-u", "admin:${OPENSEARCH_ADMIN_PASSWORD}", "http://localhost:9200/_cluster/health"]
interval: 1m
@@ -107,6 +150,9 @@ services:
start_interval: 5s
networks:
- opensearch-net
+ depends_on:
+ minio:
+ condition: service_healthy
opensearch-dashboards:
image: opensearchproject/opensearch-dashboards:${DASHBOARDS_VERSION}
@@ -119,8 +165,16 @@ services:
OPENSEARCH_HOSTS: '["http://opensearch:9200"]'
networks:
- opensearch-net
+ healthcheck:
+ test: ["CMD", "curl", "-f", "http://localhost:5601/"]
+ interval: 1m
+ timeout: 5s
+ retries: 3
+ start_period: 30s
+ start_interval: 5s
depends_on:
- - opensearch
+ opensearch:
+ condition: service_healthy
minio:
image: minio/minio
@@ -132,12 +186,37 @@ services:
- "9001:9001"
volumes:
- minio-data:/data
+ healthcheck:
+ test: ["CMD", "curl", "-q", "-f", "http://localhost:9000/minio/health/live"]
+ interval: 1m
+ timeout: 5s
+ retries: 3
+ start_period: 30s
+ start_interval: 5s
+ networks:
+ - opensearch-net
+
+ configuration-updater:
+ image: alpine/curl:latest
+ entrypoint: /bin/sh
+ command: /apply-configuration.sh
+ environment:
+ - S3_ACCESS_KEY=${S3_ACCESS_KEY}
+ - S3_SECRET_KEY=${S3_SECRET_KEY}
+ - OPENSEARCH_ADMIN_PASSWORD=${OPENSEARCH_ADMIN_PASSWORD}
+ volumes:
+ - type: bind
+ source: configuration-updater/apply-configuration.sh
+ target: /apply-configuration.sh
+ depends_on:
+ opensearch-dashboards:
+ condition: service_healthy
networks:
- opensearch-net
volumes:
+ metastore-data:
opensearch-data:
minio-data:
-
networks:
opensearch-net:
diff --git a/docker/integ-test/metastore/Dockerfile b/docker/integ-test/metastore/Dockerfile
new file mode 100644
index 000000000..79a7d725c
--- /dev/null
+++ b/docker/integ-test/metastore/Dockerfile
@@ -0,0 +1,29 @@
+# Copyright OpenSearch Contributors
+# SPDX-License-Identifier: Apache-2.0
+
+FROM openjdk:21-jdk-bookworm
+
+WORKDIR /opt
+
+ENV HADOOP_HOME=/opt/hadoop-3.3.4
+ENV HIVE_HOME=/opt/apache-hive-2.3.9-bin
+
+#RUN apt-get update
+RUN curl -L https://archive.apache.org/dist/hive/hive-2.3.9/apache-hive-2.3.9-bin.tar.gz | tar zxf -
+RUN curl -L https://archive.apache.org/dist/hadoop/common/hadoop-3.3.4/hadoop-3.3.4.tar.gz | tar zxf -
+RUN cp $HADOOP_HOME/share/hadoop/client/hadoop-client-api-3.3.4.jar $HIVE_HOME/lib/
+RUN cp $HADOOP_HOME/share/hadoop/client/hadoop-client-runtime-3.3.4.jar $HIVE_HOME/lib/
+RUN cp $HADOOP_HOME/share/hadoop/tools/lib/hadoop-aws-3.3.4.jar $HIVE_HOME/lib/
+RUN cp $HADOOP_HOME/share/hadoop/tools/lib/aws-java-sdk-bundle-1.12.262.jar $HIVE_HOME/lib/
+
+RUN groupadd -f -r hive --gid=1000
+RUN useradd -r -g hive --uid=1000 -d ${HIVE_HOME} hive
+RUN chown hive:hive -R ${HIVE_HOME}
+
+RUN mkdir /data
+RUN chown hive:hive /data
+
+WORKDIR $HIVE_HOME
+EXPOSE 9083
+ENTRYPOINT ["/opt/apache-hive-2.3.9-bin/bin/hive", "--service", "metastore"]
+USER hive
diff --git a/docker/integ-test/metastore/hive-log4j2.properties b/docker/integ-test/metastore/hive-log4j2.properties
new file mode 100644
index 000000000..7e6a3b08e
--- /dev/null
+++ b/docker/integ-test/metastore/hive-log4j2.properties
@@ -0,0 +1,62 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+status = INFO
+name = HiveLog4j2
+packages = org.apache.hadoop.hive.ql.log
+
+# list of properties
+property.hive.log.level = INFO
+property.hive.root.logger = console
+property.hive.log.dir = ${sys:java.io.tmpdir}/${sys:user.name}
+property.hive.log.file = hive.log
+property.hive.perflogger.log.level = INFO
+
+# list of all appenders
+appenders = console
+
+# console appender
+appender.console.type = Console
+appender.console.name = console
+appender.console.target = SYSTEM_ERR
+appender.console.layout.type = PatternLayout
+appender.console.layout.pattern = %d{ISO8601} %5p [%t] %c{2}: %m%n
+
+# list of all loggers
+loggers = NIOServerCnxn, ClientCnxnSocketNIO, DataNucleus, Datastore, JPOX, PerfLogger
+
+logger.NIOServerCnxn.name = org.apache.zookeeper.server.NIOServerCnxn
+logger.NIOServerCnxn.level = WARN
+
+logger.ClientCnxnSocketNIO.name = org.apache.zookeeper.ClientCnxnSocketNIO
+logger.ClientCnxnSocketNIO.level = WARN
+
+logger.DataNucleus.name = DataNucleus
+logger.DataNucleus.level = ERROR
+
+logger.Datastore.name = Datastore
+logger.Datastore.level = ERROR
+
+logger.JPOX.name = JPOX
+logger.JPOX.level = ERROR
+
+logger.PerfLogger.name = org.apache.hadoop.hive.ql.log.PerfLogger
+logger.PerfLogger.level = ${sys:hive.perflogger.log.level}
+
+# root logger
+rootLogger.level = ${sys:hive.log.level}
+rootLogger.appenderRefs = root
+rootLogger.appenderRef.root.ref = ${sys:hive.root.logger}
diff --git a/docker/integ-test/metastore/hive-site.xml b/docker/integ-test/metastore/hive-site.xml
new file mode 100644
index 000000000..e235306eb
--- /dev/null
+++ b/docker/integ-test/metastore/hive-site.xml
@@ -0,0 +1,53 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<configuration>
+  <property>
+    <name>hive.metastore.schema.verification</name>
+    <value>false</value>
+  </property>
+  <property>
+    <name>hive.metastore.warehouse.dir</name>
+    <value>file:///tmp</value>
+  </property>
+  <property>
+    <name>fs.default.name</name>
+    <value>file:///tmp</value>
+  </property>
+  <property>
+    <name>javax.jdo.option.ConnectionURL</name>
+    <value>jdbc:derby:;databaseName=/data/metastore_db;create=true</value>
+  </property>
+  <property>
+    <name>javax.jdo.option.ConnectionDriverName</name>
+    <value>org.apache.derby.jdbc.EmbeddedDriver</value>
+  </property>
+  <property>
+    <name>datanucleus.schema.autoCreateTables</name>
+    <value>true</value>
+  </property>
+  <!-- S3A settings for the Minio container -->
+  <property>
+    <name>fs.s3a.impl</name>
+    <value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
+  </property>
+  <property>
+    <name>fs.s3a.path.style.access</name>
+    <value>true</value>
+  </property>
+  <property>
+    <name>fs.s3a.access.key</name>
+    <value>Vt7jnvi5BICr1rkfsheT</value>
+  </property>
+  <property>
+    <name>fs.s3a.secret.key</name>
+    <value>5NK3StGvoGCLUWvbaGN0LBUf9N6sjE94PEzLdqwO</value>
+  </property>
+  <property>
+    <name>fs.s3a.endpoint</name>
+    <value>http://minio-S3:9000</value>
+  </property>
+  <property>
+    <name>fs.s3a.connection.ssl.enabled</name>
+    <value>false</value>
+  </property>
+</configuration>
diff --git a/docker/integ-test/opensearch/Dockerfile b/docker/integ-test/opensearch/Dockerfile
new file mode 100644
index 000000000..da042516e
--- /dev/null
+++ b/docker/integ-test/opensearch/Dockerfile
@@ -0,0 +1,44 @@
+# Copyright OpenSearch Contributors
+# SPDX-License-Identifier: Apache-2.0
+
+FROM opensearchproject/opensearch:latest
+
+USER root
+
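+# Rebuild the SQL plugin's aws-java-sdk-emrserverless jar so that its client builder returns the
+# docker-backed EMR Serverless client (see emr-src) instead of calling the real AWS service.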
+RUN mkdir /tmp/alter-emr-jar
+WORKDIR /tmp/alter-emr-jar
+
+ENV AWS_VERSION=1.12.651
+
+RUN curl -O -L https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-emrserverless/${AWS_VERSION}/aws-java-sdk-emrserverless-${AWS_VERSION}.jar
+RUN curl -O -L https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-core/${AWS_VERSION}/aws-java-sdk-core-${AWS_VERSION}.jar
+
+COPY emr-src /tmp/alter-emr-jar/emr-src
+WORKDIR /tmp/alter-emr-jar/emr-src
+RUN /usr/share/opensearch/jdk/bin/javac -cp ../aws-java-sdk-emrserverless-${AWS_VERSION}.jar:../aws-java-sdk-core-${AWS_VERSION}.jar com/amazonaws/services/emrserverless/AWSEMRServerlessClientBuilder.java org/opensearch/spark/emrserverless/DockerEMRServerlessClient.java
+RUN mkdir /tmp/alter-emr-jar/extracted
+WORKDIR /tmp/alter-emr-jar/extracted
+RUN /usr/share/opensearch/jdk/bin/jar -xf ../aws-java-sdk-emrserverless-${AWS_VERSION}.jar
+RUN cp ../emr-src/com/amazonaws/services/emrserverless/AWSEMRServerlessClientBuilder.class com/amazonaws/services/emrserverless/
+RUN mkdir -p org/opensearch/spark/emrserverless
+RUN cp ../emr-src/org/opensearch/spark/emrserverless/DockerEMRServerlessClient.class org/opensearch/spark/emrserverless/
+RUN /usr/share/opensearch/jdk/bin/jar -cfM /usr/share/opensearch/plugins/opensearch-sql/aws-java-sdk-emrserverless-*.jar META-INF/MANIFEST.MF *
+RUN chown opensearch:opensearch /usr/share/opensearch/plugins/opensearch-sql/aws-java-sdk-emrserverless-*.jar
+RUN rm -rf /tmp/alter-emr-jar
+
+RUN yum install -y docker util-linux
+
+COPY opensearch-docker-it-entrypoint.sh /usr/share/opensearch/opensearch-docker-it-entrypoint.sh
+COPY docker-command-runner.sh /usr/share/opensearch/docker-command-runner.sh
+COPY opensearch_security.policy /usr/share/opensearch/config/opensearch-performance-analyzer/opensearch_security.policy
+COPY log4j2.properties /usr/share/opensearch/config/log4j2.properties
+
+RUN chown opensearch:opensearch /usr/share/opensearch/config/opensearch-performance-analyzer/opensearch_security.policy
+RUN chown opensearch:opensearch /usr/share/opensearch/config/log4j2.properties
+
+WORKDIR /usr/share/opensearch
+ENTRYPOINT ["./opensearch-docker-it-entrypoint.sh"]
+CMD ["opensearch"]
+
+EXPOSE 9200
+EXPOSE 9300
diff --git a/docker/integ-test/opensearch/docker-command-runner.sh b/docker/integ-test/opensearch/docker-command-runner.sh
new file mode 100755
index 000000000..25c1927ca
--- /dev/null
+++ b/docker/integ-test/opensearch/docker-command-runner.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+# Copyright OpenSearch Contributors
+# SPDX-License-Identifier: Apache-2.0
+
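+# Watches /tmp/docker for *.cmd files written by the SQL plugin's docker-backed EMR serverless
+# client. Each file contains arguments for the docker CLI; the command is executed and its
+# stdout, stderr and exit code are written next to it as .stdout/.stderr/.exitCode files.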
+function process_files {
+ for cmd_file in `ls -1`; do
+ echo "$cmd_file" | grep -q 'cmd$'
+ if [ "$?" -eq "0" ]; then
+ stdout_filename=$(echo $cmd_file | sed -e 's/cmd$/stdout/')
+ stderr_filename=$(echo $cmd_file | sed -e 's/cmd$/stderr/')
+ exit_code_filename=$(echo $cmd_file | sed -e 's/cmd/exitCode/')
+
+ /usr/bin/docker $(cat $cmd_file) > $stdout_filename 2> $stderr_filename
+ echo "$?" > $exit_code_filename
+
+ rm $cmd_file
+ fi
+ done
+}
+
+if [ ! -d '/tmp/docker' ]; then
+ mkdir /tmp/docker
+ chown opensearch:opensearch /tmp/docker
+fi
+
+cd /tmp/docker
+while true; do
+ process_files
+ sleep 1
+done
+
diff --git a/docker/integ-test/opensearch/emr-src/com/amazonaws/services/emrserverless/AWSEMRServerlessClientBuilder.java b/docker/integ-test/opensearch/emr-src/com/amazonaws/services/emrserverless/AWSEMRServerlessClientBuilder.java
new file mode 100644
index 000000000..4666b494f
--- /dev/null
+++ b/docker/integ-test/opensearch/emr-src/com/amazonaws/services/emrserverless/AWSEMRServerlessClientBuilder.java
@@ -0,0 +1,34 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package com.amazonaws.services.emrserverless;
+
+import com.amazonaws.ClientConfigurationFactory;
+import com.amazonaws.client.AwsSyncClientParams;
+import com.amazonaws.client.builder.AwsSyncClientBuilder;
+
+import org.opensearch.spark.emrserverless.DockerEMRServerlessClient;
+
+public class AWSEMRServerlessClientBuilder extends AwsSyncClientBuilder<AWSEMRServerlessClientBuilder, AWSEMRServerless> {
+ private static final ClientConfigurationFactory CLIENT_CONFIG_FACTORY = new ClientConfigurationFactory();
+
+ private AWSEMRServerlessClientBuilder() {
+ super(CLIENT_CONFIG_FACTORY);
+ }
+
+ public static AWSEMRServerlessClientBuilder standard() {
+ return new AWSEMRServerlessClientBuilder();
+ }
+
+ public static AWSEMRServerless defaultClient() {
+ return (AWSEMRServerless) standard().build();
+ }
+
+ protected AWSEMRServerless build(AwsSyncClientParams params) {
+ DockerEMRServerlessClient client = new DockerEMRServerlessClient(CLIENT_CONFIG_FACTORY.getConfig());
+ client.setServiceNameIntern("emr");
+ return client;
+ }
+}
diff --git a/docker/integ-test/opensearch/emr-src/org/opensearch/spark/emrserverless/DockerEMRServerlessClient.java b/docker/integ-test/opensearch/emr-src/org/opensearch/spark/emrserverless/DockerEMRServerlessClient.java
new file mode 100644
index 000000000..88cc448a9
--- /dev/null
+++ b/docker/integ-test/opensearch/emr-src/org/opensearch/spark/emrserverless/DockerEMRServerlessClient.java
@@ -0,0 +1,216 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.spark.emrserverless;
+
+import com.amazonaws.AmazonWebServiceClient;
+import com.amazonaws.AmazonWebServiceRequest;
+import com.amazonaws.ClientConfiguration;
+import com.amazonaws.ResponseMetadata;
+import com.amazonaws.services.emrserverless.AWSEMRServerless;
+import com.amazonaws.services.emrserverless.model.CancelJobRunRequest;
+import com.amazonaws.services.emrserverless.model.CancelJobRunResult;
+import com.amazonaws.services.emrserverless.model.CreateApplicationRequest;
+import com.amazonaws.services.emrserverless.model.CreateApplicationResult;
+import com.amazonaws.services.emrserverless.model.DeleteApplicationRequest;
+import com.amazonaws.services.emrserverless.model.DeleteApplicationResult;
+import com.amazonaws.services.emrserverless.model.GetApplicationRequest;
+import com.amazonaws.services.emrserverless.model.GetApplicationResult;
+import com.amazonaws.services.emrserverless.model.GetDashboardForJobRunRequest;
+import com.amazonaws.services.emrserverless.model.GetDashboardForJobRunResult;
+import com.amazonaws.services.emrserverless.model.GetJobRunRequest;
+import com.amazonaws.services.emrserverless.model.GetJobRunResult;
+import com.amazonaws.services.emrserverless.model.ListApplicationsRequest;
+import com.amazonaws.services.emrserverless.model.ListApplicationsResult;
+import com.amazonaws.services.emrserverless.model.ListJobRunsRequest;
+import com.amazonaws.services.emrserverless.model.ListJobRunsResult;
+import com.amazonaws.services.emrserverless.model.ListTagsForResourceRequest;
+import com.amazonaws.services.emrserverless.model.ListTagsForResourceResult;
+import com.amazonaws.services.emrserverless.model.StartApplicationRequest;
+import com.amazonaws.services.emrserverless.model.StartApplicationResult;
+import com.amazonaws.services.emrserverless.model.StartJobRunRequest;
+import com.amazonaws.services.emrserverless.model.StartJobRunResult;
+import com.amazonaws.services.emrserverless.model.StopApplicationRequest;
+import com.amazonaws.services.emrserverless.model.StopApplicationResult;
+import com.amazonaws.services.emrserverless.model.TagResourceRequest;
+import com.amazonaws.services.emrserverless.model.TagResourceResult;
+import com.amazonaws.services.emrserverless.model.UntagResourceRequest;
+import com.amazonaws.services.emrserverless.model.UntagResourceResult;
+import com.amazonaws.services.emrserverless.model.UpdateApplicationRequest;
+import com.amazonaws.services.emrserverless.model.UpdateApplicationResult;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.stream.Collectors;
+
+public class DockerEMRServerlessClient extends AmazonWebServiceClient implements AWSEMRServerless {
+ private static final AtomicInteger JOB_ID = new AtomicInteger(1);
+
+ public DockerEMRServerlessClient(ClientConfiguration clientConfiguration) {
+ super(clientConfiguration);
+ setEndpointPrefix("emr");
+ }
+
+ @Override
+ public CancelJobRunResult cancelJobRun(final CancelJobRunRequest cancelJobRunRequest) {
+ return null;
+ }
+
+ @Override
+ public CreateApplicationResult createApplication(final CreateApplicationRequest createApplicationRequest) {
+ return null;
+ }
+
+ @Override
+ public DeleteApplicationResult deleteApplication(final DeleteApplicationRequest deleteApplicationRequest) {
+ return null;
+ }
+
+ @Override
+ public GetApplicationResult getApplication(final GetApplicationRequest getApplicationRequest) {
+ return null;
+ }
+
+ @Override
+ public GetDashboardForJobRunResult getDashboardForJobRun(
+ final GetDashboardForJobRunRequest getDashboardForJobRunRequest) {
+ return null;
+ }
+
+ @Override
+ public GetJobRunResult getJobRun(final GetJobRunRequest getJobRunRequest) {
+ return null;
+ }
+
+ @Override
+ public ListApplicationsResult listApplications(final ListApplicationsRequest listApplicationsRequest) {
+ return null;
+ }
+
+ @Override
+ public ListJobRunsResult listJobRuns(final ListJobRunsRequest listJobRunsRequest) {
+ return null;
+ }
+
+ @Override
+ public ListTagsForResourceResult listTagsForResource(
+ final ListTagsForResourceRequest listTagsForResourceRequest) {
+ return null;
+ }
+
+ @Override
+ public StartApplicationResult startApplication(final StartApplicationRequest startApplicationRequest) {
+ return null;
+ }
+
+ @Override
+ public StartJobRunResult startJobRun(final StartJobRunRequest startJobRunRequest) {
+ String entryPoint = startJobRunRequest.getJobDriver().getSparkSubmit().getEntryPoint();
+ List<String> entryPointArguments = startJobRunRequest.getJobDriver().getSparkSubmit().getEntryPointArguments();
+ String sparkSubmitParameters = startJobRunRequest.getJobDriver().getSparkSubmit().getSparkSubmitParameters();
+
+ final int jobId = JOB_ID.getAndIncrement();
+
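+ // Build the argument list for a "docker run" of the spark-submit image; this stands in for an
+ // EMR Serverless job run. The container joins the compose network so it can reach OpenSearch,
+ // the metastore and Minio.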
+ List<String> runContainerCmd = new ArrayList<>();
+ runContainerCmd.add("run");
+ runContainerCmd.add("-d");
+ runContainerCmd.add("--rm");
+ runContainerCmd.add("--env");
+ runContainerCmd.add("SERVERLESS_EMR_JOB_ID=" + jobId);
+ runContainerCmd.add("--network");
+ runContainerCmd.add("integ-test_opensearch-net");
+ runContainerCmd.add("integ-test-spark-submit:latest");
+ runContainerCmd.add("/opt/bitnami/spark/bin/spark-submit");
+ runContainerCmd.add("--deploy-mode");
+ runContainerCmd.add("client");
+ runContainerCmd.add("--exclude-packages");
+ runContainerCmd.add("org.opensearch:opensearch-spark-standalone_2.12,org.opensearch:opensearch-spark-sql-application_2.12,org.opensearch:opensearch-spark-ppl_2.12");
+ runContainerCmd.add("--master");
+ runContainerCmd.add("local[2]");
+
+ runContainerCmd.addAll(Arrays.asList(sparkSubmitParameters.split(" ")));
+ runContainerCmd.addAll(entryPointArguments);
+
+ final List<String> cmd = runContainerCmd.stream().filter(s -> !s.isBlank()).collect(Collectors.toList());
+
+ for (int i = 1; i < cmd.size(); i++) {
+ if (cmd.get(i - 1).equals("--conf")) {
+ if (cmd.get(i).startsWith("spark.datasource.flint.customAWSCredentialsProvider=") ||
+ cmd.get(i).startsWith("spark.datasource.flint.") ||
+ cmd.get(i).startsWith("spark.hadoop.hive.metastore.client.factory.class=")) {
+ cmd.remove(i - 1);
+ cmd.remove(i - 1);
+ i -= 2;
+ } else if (cmd.get(i).startsWith("spark.emr-serverless.driverEnv.JAVA_HOME=")) {
+ cmd.set(i, "spark.emr-serverless.driverEnv.JAVA_HOME=/opt/bitnami/java");
+ } else if (cmd.get(i).startsWith("spark.executorEnv.JAVA_HOME=")) {
+ cmd.set(i, "spark.executorEnv.JAVA_HOME=/opt/bitnami/java");
+ }
+ }
+ }
+
+ cmd.add(cmd.size() - 1, "/app/spark-sql-application.jar");
+
+ System.out.println(">>> " + String.join(" ", cmd));
+
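+ // Hand the command to docker-command-runner.sh on the OpenSearch container by writing it to a
+ // .cmd file under /tmp/docker, then wait (up to ~60s) for the runner to report an exit code
+ // before returning the job run details.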
+ try {
+ File dockerDir = new File("/tmp/docker");
+ File cmdFile = File.createTempFile("docker_", ".cmd", dockerDir);
+ FileOutputStream fos = new FileOutputStream(cmdFile);
+ fos.write(String.join(" ", cmd).getBytes(StandardCharsets.UTF_8));
+ fos.close();
+
+ String cmdFilename = cmdFile.getName();
+ String filenameBase = cmdFilename.substring(7, cmdFilename.length() - 4);
+ File exitCodeFile = new File(dockerDir, "docker_" + filenameBase + ".exitCode");
+ for (int i = 0; i < 600 && !exitCodeFile.exists(); i++) {
+ Thread.sleep(100L);
+ }
+
+ if (exitCodeFile.exists()) {
+ StartJobRunResult startJobResult = new StartJobRunResult();
+ startJobResult.setApplicationId(startJobRunRequest.getApplicationId());
+ startJobResult.setArn("arn:aws:emr-containers:foo:123456789012:/virtualclusters/0/jobruns/" + jobId);
+ startJobResult.setJobRunId(Integer.toString(jobId));
+ return startJobResult;
+ }
+ } catch (IOException | InterruptedException e) {
+ e.printStackTrace();
+ }
+
+ return null;
+ }
+
+ @Override
+ public StopApplicationResult stopApplication(final StopApplicationRequest stopApplicationRequest) {
+ return null;
+ }
+
+ @Override
+ public TagResourceResult tagResource(final TagResourceRequest tagResourceRequest) {
+ return null;
+ }
+
+ @Override
+ public UntagResourceResult untagResource(final UntagResourceRequest untagResourceRequest) {
+ return null;
+ }
+
+ @Override
+ public UpdateApplicationResult updateApplication(final UpdateApplicationRequest updateApplicationRequest) {
+ return null;
+ }
+
+ @Override
+ public ResponseMetadata getCachedResponseMetadata(final AmazonWebServiceRequest amazonWebServiceRequest) {
+ return null;
+ }
+}
diff --git a/docker/integ-test/log4j2.properties b/docker/integ-test/opensearch/log4j2.properties
similarity index 100%
rename from docker/integ-test/log4j2.properties
rename to docker/integ-test/opensearch/log4j2.properties
diff --git a/docker/integ-test/opensearch/opensearch-docker-it-entrypoint.sh b/docker/integ-test/opensearch/opensearch-docker-it-entrypoint.sh
new file mode 100755
index 000000000..277f531a9
--- /dev/null
+++ b/docker/integ-test/opensearch/opensearch-docker-it-entrypoint.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+# Copyright OpenSearch Contributors
+# SPDX-License-Identifier: Apache-2.0
+
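+# Run the docker command runner in the background so that the SQL plugin can launch
+# spark-submit containers through the mounted docker socket.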
+./docker-command-runner.sh &
+echo $! > /var/run/docker-command-runner.pid
+
+su opensearch ./opensearch-docker-entrypoint.sh "$@"
+
+kill -TERM `cat /var/run/docker-command-runner.pid`
diff --git a/docker/integ-test/opensearch/opensearch_security.policy b/docker/integ-test/opensearch/opensearch_security.policy
new file mode 100644
index 000000000..8e4de3246
--- /dev/null
+++ b/docker/integ-test/opensearch/opensearch_security.policy
@@ -0,0 +1,18 @@
+grant {
+ permission java.lang.management.ManagementPermission "control";
+ permission java.net.SocketPermission "localhost:9600","connect,resolve";
+ permission java.lang.RuntimePermission "getClassLoader";
+ permission java.io.FilePermission "/tmp/docker/-", "read,write,delete";
+};
+
+grant codebase "file:${java.home}/../lib/tools.jar" {
+ permission java.security.AllPermission;
+};
+
+grant codeBase "jrt:/jdk.attach" {
+ permission java.security.AllPermission;
+};
+
+grant codeBase "jrt:/jdk.internal.jvmstat" {
+ permission java.security.AllPermission;
+};
diff --git a/docker/integ-test/spark-submit/Dockerfile b/docker/integ-test/spark-submit/Dockerfile
new file mode 100644
index 000000000..d61caad52
--- /dev/null
+++ b/docker/integ-test/spark-submit/Dockerfile
@@ -0,0 +1,20 @@
+# Copyright OpenSearch Contributors
+# SPDX-License-Identifier: Apache-2.0
+
+FROM bitnami/spark:3.5.3
+ARG FLINT_JAR
+ARG PPL_JAR
+ARG SQL_APP_JAR
+ENV FLINT_JAR $FLINT_JAR
+ENV PPL_JAR $PPL_JAR
+ENV SQL_APP_JAR $SQL_APP_JAR
+
+COPY docker/integ-test/spark/spark-defaults.conf /opt/bitnami/spark/conf/spark-defaults.conf
+COPY ${FLINT_JAR} /opt/bitnami/spark/jars/flint-spark-integration.jar
+COPY ${PPL_JAR} /opt/bitnami/spark/jars/ppl-spark-integration.jar
+
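+# The docker-backed EMR client adds /app/spark-sql-application.jar to the spark-submit command
+# line, so the SQL application jar must be available at that path inside the image.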
+USER root
+RUN mkdir /app
+COPY ${SQL_APP_JAR} /app/spark-sql-application.jar
+
+USER 1001
diff --git a/docker/integ-test/spark-defaults.conf b/docker/integ-test/spark-submit/spark-defaults.conf
similarity index 74%
rename from docker/integ-test/spark-defaults.conf
rename to docker/integ-test/spark-submit/spark-defaults.conf
index 19b9e4ec1..acd49625b 100644
--- a/docker/integ-test/spark-defaults.conf
+++ b/docker/integ-test/spark-submit/spark-defaults.conf
@@ -25,11 +25,21 @@
# spark.serializer org.apache.spark.serializer.KryoSerializer
# spark.driver.memory 5g
# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"
+spark.sql.catalogImplementation hive
+spark.hadoop.hive.metastore.uris thrift://metastore:9083
spark.sql.extensions org.opensearch.flint.spark.FlintPPLSparkExtensions,org.opensearch.flint.spark.FlintSparkExtensions
spark.sql.catalog.dev org.apache.spark.opensearch.catalog.OpenSearchCatalog
+spark.sql.catalog.mys3 org.opensearch.sql.FlintDelegatingSessionCatalog
spark.datasource.flint.host opensearch
spark.datasource.flint.port 9200
spark.datasource.flint.scheme http
spark.datasource.flint.auth basic
spark.datasource.flint.auth.username admin
spark.datasource.flint.auth.password C0rrecthorsebatterystaple.
+spark.sql.warehouse.dir s3a://integ-test
+spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem
+spark.hadoop.fs.s3a.path.style.access true
+spark.hadoop.fs.s3a.access.key Vt7jnvi5BICr1rkfsheT
+spark.hadoop.fs.s3a.secret.key 5NK3StGvoGCLUWvbaGN0LBUf9N6sjE94PEzLdqwO
+spark.hadoop.fs.s3a.endpoint http://minio-S3:9000
+spark.hadoop.fs.s3a.connection.ssl.enabled false
\ No newline at end of file
diff --git a/docker/integ-test/spark/Dockerfile b/docker/integ-test/spark/Dockerfile
new file mode 100644
index 000000000..f180320a6
--- /dev/null
+++ b/docker/integ-test/spark/Dockerfile
@@ -0,0 +1,14 @@
+# Copyright OpenSearch Contributors
+# SPDX-License-Identifier: Apache-2.0
+
+ARG SPARK_VERSION=3.5.3
+FROM bitnami/spark:${SPARK_VERSION}
+
+USER root
+RUN apt update
+RUN apt install -y curl
+
+USER 1001
+COPY ./spark-defaults.conf /opt/bitnami/spark/conf/spark-defaults.conf
+COPY ./log4j2.properties /opt/bitnami/spark/conf/log4j2.properties
+COPY ./spark-master-entrypoint.sh /opt/bitnami/scripts/spark/spark-master-entrypoint.sh
diff --git a/docker/integ-test/spark/log4j2.properties b/docker/integ-test/spark/log4j2.properties
new file mode 100644
index 000000000..ab96e03ba
--- /dev/null
+++ b/docker/integ-test/spark/log4j2.properties
@@ -0,0 +1,69 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Set everything to be logged to the console
+rootLogger.level = info
+rootLogger.appenderRef.stdout.ref = console
+
+# In the pattern layout configuration below, we specify an explicit `%ex` conversion
+# pattern for logging Throwables. If this was omitted, then (by default) Log4J would
+# implicitly add an `%xEx` conversion pattern which logs stacktraces with additional
+# class packaging information. That extra information can sometimes add a substantial
+# performance overhead, so we disable it in our default logging config.
+# For more information, see SPARK-39361.
+appender.console.type = Console
+appender.console.name = console
+appender.console.target = SYSTEM_ERR
+appender.console.layout.type = PatternLayout
+appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n%ex
+
+# Set the default spark-shell/spark-sql log level to WARN. When running the
+# spark-shell/spark-sql, the log level for these classes is used to overwrite
+# the root logger's log level, so that the user can have different defaults
+# for the shell and regular Spark apps.
+logger.repl.name = org.apache.spark.repl.Main
+logger.repl.level = warn
+
+logger.thriftserver.name = org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver
+logger.thriftserver.level = warn
+
+# Settings to quiet third party logs that are too verbose
+logger.jetty1.name = org.sparkproject.jetty
+logger.jetty1.level = warn
+logger.jetty2.name = org.sparkproject.jetty.util.component.AbstractLifeCycle
+logger.jetty2.level = error
+logger.replexprTyper.name = org.apache.spark.repl.SparkIMain$exprTyper
+logger.replexprTyper.level = info
+logger.replSparkILoopInterpreter.name = org.apache.spark.repl.SparkILoop$SparkILoopInterpreter
+logger.replSparkILoopInterpreter.level = info
+logger.parquet1.name = org.apache.parquet
+logger.parquet1.level = error
+logger.parquet2.name = parquet
+logger.parquet2.level = error
+
+# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
+logger.RetryingHMSHandler.name = org.apache.hadoop.hive.metastore.RetryingHMSHandler
+logger.RetryingHMSHandler.level = fatal
+logger.FunctionRegistry.name = org.apache.hadoop.hive.ql.exec.FunctionRegistry
+logger.FunctionRegistry.level = error
+
+# For deploying Spark ThriftServer
+# SPARK-34128: Suppress undesirable TTransportException warnings involved in THRIFT-4805
+appender.console.filter.1.type = RegexFilter
+appender.console.filter.1.regex = .*Thrift error occurred during processing of message.*
+appender.console.filter.1.onMatch = deny
+appender.console.filter.1.onMismatch = neutral
diff --git a/docker/integ-test/spark/spark-defaults.conf b/docker/integ-test/spark/spark-defaults.conf
new file mode 100644
index 000000000..acd49625b
--- /dev/null
+++ b/docker/integ-test/spark/spark-defaults.conf
@@ -0,0 +1,45 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Default system properties included when running spark-submit.
+# This is useful for setting default environmental settings.
+
+# Example:
+# spark.master spark://master:7077
+# spark.eventLog.enabled true
+# spark.eventLog.dir hdfs://namenode:8021/directory
+# spark.serializer org.apache.spark.serializer.KryoSerializer
+# spark.driver.memory 5g
+# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"
+spark.sql.catalogImplementation hive
+spark.hadoop.hive.metastore.uris thrift://metastore:9083
+spark.sql.extensions org.opensearch.flint.spark.FlintPPLSparkExtensions,org.opensearch.flint.spark.FlintSparkExtensions
+spark.sql.catalog.dev org.apache.spark.opensearch.catalog.OpenSearchCatalog
+spark.sql.catalog.mys3 org.opensearch.sql.FlintDelegatingSessionCatalog
+spark.datasource.flint.host opensearch
+spark.datasource.flint.port 9200
+spark.datasource.flint.scheme http
+spark.datasource.flint.auth basic
+spark.datasource.flint.auth.username admin
+spark.datasource.flint.auth.password C0rrecthorsebatterystaple.
+spark.sql.warehouse.dir s3a://integ-test
+spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem
+spark.hadoop.fs.s3a.path.style.access true
+spark.hadoop.fs.s3a.access.key Vt7jnvi5BICr1rkfsheT
+spark.hadoop.fs.s3a.secret.key 5NK3StGvoGCLUWvbaGN0LBUf9N6sjE94PEzLdqwO
+spark.hadoop.fs.s3a.endpoint http://minio-S3:9000
+spark.hadoop.fs.s3a.connection.ssl.enabled false
\ No newline at end of file
diff --git a/docker/integ-test/spark-master-entrypoint.sh b/docker/integ-test/spark/spark-master-entrypoint.sh
similarity index 88%
rename from docker/integ-test/spark-master-entrypoint.sh
rename to docker/integ-test/spark/spark-master-entrypoint.sh
index a21c20643..51caa787b 100755
--- a/docker/integ-test/spark-master-entrypoint.sh
+++ b/docker/integ-test/spark/spark-master-entrypoint.sh
@@ -1,5 +1,8 @@
#!/bin/bash
+# Copyright OpenSearch Contributors
+# SPDX-License-Identifier: Apache-2.0
+
function start_spark_connect() {
sc_version=$(ls -1 /opt/bitnami/spark/jars/spark-core_*.jar | sed -e 's/^.*\/spark-core_//' -e 's/\.jar$//' -e 's/-/:/')
diff --git a/docs/docker/integ-test/README.md b/docs/docker/integ-test/README.md
new file mode 100644
index 000000000..30bf2b1f2
--- /dev/null
+++ b/docs/docker/integ-test/README.md
@@ -0,0 +1,226 @@
+# Docker Cluster for Integration Testing
+
+## Introduction
+
+The docker cluster in `docker/integ-test` is designed to be used for integration testing. It supports the following
+use cases:
+1. Submitting queries directly to Spark in order to test the PPL extension for Spark.
+2. Submitting queries directly to Spark that use the OpenSearch datasource. Useful for testing the Flint extension
+ for Spark.
+3. Using the Async API to submit queries to the OpenSearch server. Useful for testing the EMR workflow and querying
+ S3/Glue datasources. A local container is run rather than using the AWS EMR service.
+
+The cluster consists of several containers and handles configuring them. No tables are created.
+
+## Overview
+
+![Integration test containers](images/integ-test-containers.png)
+
+All containers run in a dedicated docker network.
+
+### OpenSearch Dashboards
+
+An OpenSearch dashboards server that is connected to the OpenSearch server. It is exposed to the host OS,
+so it can be accessed with a browser.
+
+### OpenSearch
+
+An OpenSearch server. It is running in standalone mode. It is exposed to the host OS. It is configured to have
+an S3/Glue datasource with the name `mys3`. System indices and system indices permissions are disabled.
+
+This container also has a docker volume used to persist data such as local indices.
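+
+Once the cluster is configured, the datasource can be checked with a REST call (this assumes the SQL plugin's
+datasources API is available):
+
+```shell
+curl -u 'admin:C0rrecthorsebatterystaple.' \
+  'http://localhost:9200/_plugins/_query/_datasources/mys3'
+```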
+
+### Spark
+
+The Spark master node. It is configured to use an external Hive metastore in the container `metastore`. The
+Spark master also has the Flint and PPL extensions installed. It can use locally built Jar files when building
+the docker image.
+
+Spark Connect is also running in this container and can be used to easily issue queries to run. The port for
+Spark Connect is exposed to the host OS.
+
+Spark is configured with an OpenSearch catalog named `dev`. Indices on the OpenSearch server can be queried
+as `dev.default.<index name>`.
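+
+For example, an existing index can be read through the catalog from `spark-shell` (the index name `my_index` is
+only illustrative and must already exist in OpenSearch):
+
+```scala
+spark.sql("SELECT * FROM dev.default.my_index").show()
+```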
+
+### Spark Worker
+
+The Spark worker node. It is configured to use an external Hive metastore in the container `metastore`. The
+Spark worker also has the Flint and PPL extensions installed. It can use locally built Jar files when building
+the docker image.
+
+### Spark Submit
+
+A temporary container that runs queries for an Async API session. It is started by the OpenSearch container. It
+does not connect to the Spark cluster and instead runs the queries locally. It will keep looking for more
+queries to run until it reaches its timeout (3 minutes by default).
+
+The Spark submit container is configured to use an external Hive metastore in the container `metastore`. The
+Flint and PPL extensions are installed. When building the docker image, locally built Jar files can be used.
+
+### Metastore (Hive)
+
+A Hive server that is used as a metastore for the Spark containers. It is configured to use the `integ-test`
+bucket on the Minio container.
+
+This container also has a docker volume used to persist the metastore.
+
+### Minio (S3)
+
+A Minio server that acts as an S3 server. It is used as part of the workflow for executing an S3/Glue query.
+It contains the S3 table data.
+
+This container also has a docker volume used to persist the S3 data.
+
+### Configuration-Updater
+
+A temporary container that is used to configure the OpenSearch and Minio containers. It is run after both
+of those have started up. For Minio, it will add the `integ-test` bucket and create an access key. For
+OpenSearch, it will create the S3/Glue datasource and apply a cluster configuration.
+
+## Running the Cluster
+
+To start the cluster, go to the directory `docker/integ-test` and use Docker Compose. When starting the cluster,
+wait for the `spark-worker` container to finish starting up. It is the last container to start.
+
+Start cluster in foreground:
+```shell
+docker compose up
+```
+
+Start cluster in the background:
+```shell
+docker compose up -d
+```
+
+Stopping the cluster:
+```shell
+docker compose down
+```
+
+## Creating Tables in S3
+
+Tables need to be created in Spark as external tables. Their location must be set to a path under `s3a://integ-test/`.
+You can use `spark-shell` on the Spark master container to do this:
+```shell
+docker exec -it spark spark-shell
+```
+
+Example for creating a table and adding data:
+```scala
+spark.sql("CREATE EXTERNAL TABLE foo (id int, name varchar(100)) location 's3a://integ-test/foo'")
+spark.sql("INSERT INTO foo (id, name) VALUES(1, 'Foo')")
+```
+
+## Querying an S3 Table
+
+A REST call to the OpenSearch container can be used to query the table using the Async API.
+
+[Async Query Creation API](https://github.com/opensearch-project/sql/blob/main/docs/user/interfaces/asyncqueryinterface.rst#async-query-creation-api)
+```shell
+curl \
+ -u 'admin:C0rrecthorsebatterystaple.' \
+ -X POST \
+ -H 'Content-Type: application/json' \
+ -d '{"datasource": "mys3", "lang": "sql", "query": "SELECT * FROM mys3.default.foo"}' \
+ http://localhost:9200/_plugins/_async_query
+```
+
+Sample response:
+```json
+{
+ "queryId": "HlbM61kX6MDkAktO",
+ "sessionId": "1Giy65ZnzNlmsPAm"
+}
+```
+
+When the query is finished, the results can be retrieved with a REST call to the OpenSearch container.
+
+[Async Query Result API](https://github.com/opensearch-project/sql/blob/main/docs/user/interfaces/asyncqueryinterface.rst#async-query-result-api)
+```shell
+curl \
+ -u 'admin:C0rrecthorsebatterystaple.' \
+ -X GET \
+ 'http://localhost:9200/_plugins/_async_query/HlbM61kX6MDkAktO'
+```
+
+Sample response:
+```json
+{
+ "status": "SUCCESS",
+ "schema": [
+ {
+ "name": "id",
+ "type": "integer"
+ },
+ {
+ "name": "name",
+ "type": "string"
+ }
+ ],
+ "datarows": [
+ [
+ 1,
+ "Foo"
+ ]
+ ],
+ "total": 1,
+ "size": 1
+}
+```
+
+## Configuration of the Cluster
+
+There are several settings that can be adjusted for the cluster.
+
+* SPARK_VERSION - the tag of the `bitnami/spark` docker image to use
+* OPENSEARCH_VERSION - the tag of the `opensearchproject/opensearch` docker image to use
+* DASHBOARDS_VERSION - the tag of the `opensearchproject/opensearch-dashboards` docker image to use
+* MASTER_UI_PORT - port on the host OS to map to the master UI port (8080) of the Spark master
+* MASTER_PORT - port on the host OS to map to the master port (7077) on the Spark master
+* UI_PORT - port on the host OS to map to the UI port (4040) on the Spark master
+* SPARK_CONNECT_PORT - port on the host OS to map to the Spark Connect port (15002) on the Spark master
+* PPL_JAR - The relative path to the PPL extension Jar file. Must be within the base directory of this repository
+* FLINT_JAR - The relative path to the Flint extension Jar file. Must be within the base directory of this
+ repository
+* SQL_APP_JAR - The relative path to the SQL application Jar file. Must be within the base directory of this
+ repository
+* OPENSEARCH_NODE_MEMORY - Amount of memory to allocate for the OpenSearch server
+* OPENSEARCH_ADMIN_PASSWORD - Password for the admin user of the OpenSearch server
+* OPENSEARCH_PORT - port on the host OS to map to port 9200 on the OpenSearch server
+* OPENSEARCH_PA_PORT - port on the host OS to map to the performance analyzer port (9600) on the OpenSearch
+ server
+* OPENSEARCH_DASHBOARDS_PORT - port on the host OS to map to the OpenSearch dashboards server
+* S3_ACCESS_KEY - access key to create on the Minio container
+* S3_SECRET_KEY - secret key to create on the Minio container
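+
+These settings are read from the `.env` file in `docker/integ-test` and can also be overridden from the shell
+when starting the cluster. For example (the values shown are only illustrative):
+
+```shell
+cd docker/integ-test
+OPENSEARCH_PORT=9201 MASTER_UI_PORT=8081 docker compose up -d
+```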
+
+## Async API Overview
+
+[Async API Interfaces](https://github.com/opensearch-project/sql/blob/main/docs/user/interfaces/asyncqueryinterface.rst)
+
+[Async API Documentation](https://opensearch.org/docs/latest/search-plugins/async/index/)
+
+The Async API is able to query S3/Glue datasources. It does this by calling the AWS EMR service to run the
+query in a docker container. The docker container uses Spark and is able to access the Glue catalog and
+retrieve data from S3.
+
+For the docker cluster, Minio is used in place of S3. Docker itself is used in place of AWS EMR.
+
+![Async API query workflow](images/OpenSearch_Async_API.png)
+
+1. Client submits a request to the async query API endpoint.
+2. OpenSearch server creates a special index (if it doesn't exist). This index is used to store async API requests
+ along with some state information.
+3. OpenSearch server checks if the query is for an S3/Glue datasource. If it is not, then OpenSearch can handle
+ the request on its own.
+4. OpenSearch uses docker to start a new container to process queries for the current async API session.
+5. OpenSearch returns the queryId and sessionId to the Client.
+6. Spark submit docker container starts up.
+7. Spark submit docker container searches the index from step 2 for a query to run in the current session.
+8. Spark submit docker container creates a special OpenSearch index (if it doesn't exist). This index is used to
+ store the results of the async API queries.
+9. Spark submit docker container looks up the table metadata from the `metastore` container.
+10. Spark submit docker container retrieves the data from the Minio container.
+11. Spark submit docker container writes the results to the OpenSearch index from step 8.
+12. Client submits a request to the async query results API endpoint using the queryId from step 5 (see the
+    polling sketch below).
+13. OpenSearch returns the results to the Client.
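+
+In practice, step 12 is repeated until the query reaches a terminal state. A minimal polling sketch, using the
+illustrative queryId from the earlier example:
+
+```shell
+QUERY_ID=HlbM61kX6MDkAktO
+until curl -s -u 'admin:C0rrecthorsebatterystaple.' \
+    "http://localhost:9200/_plugins/_async_query/${QUERY_ID}" | grep -qE '"(SUCCESS|FAILED)"'; do
+  sleep 5
+done
+```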
diff --git a/docs/docker/integ-test/images/OpenSearch_Async_API.png b/docs/docker/integ-test/images/OpenSearch_Async_API.png
new file mode 100644
index 000000000..882f73902
Binary files /dev/null and b/docs/docker/integ-test/images/OpenSearch_Async_API.png differ
diff --git a/docs/docker/integ-test/images/integ-test-containers.png b/docs/docker/integ-test/images/integ-test-containers.png
new file mode 100644
index 000000000..92878e187
Binary files /dev/null and b/docs/docker/integ-test/images/integ-test-containers.png differ
diff --git a/docs/spark-docker.md b/docs/docker/spark-docker.md
similarity index 96%
rename from docs/spark-docker.md
rename to docs/docker/spark-docker.md
index d1200e2b3..9c505f1cd 100644
--- a/docs/spark-docker.md
+++ b/docs/docker/spark-docker.md
@@ -19,7 +19,7 @@ sbt clean
sbt assembly
```
-Refer to the [Developer Guide](../DEVELOPER_GUIDE.md) for more information.
+Refer to the [Developer Guide](../../DEVELOPER_GUIDE.md) for more information.
## Using Docker Compose
@@ -65,7 +65,7 @@ spark.sql("INSERT INTO test_table (id, name) VALUES(2, 'Bar')")
spark.sql("source=test_table | eval x = id + 5 | fields x, name").show()
```
-For further information, see the [Spark PPL Test Instructions](ppl-lang/local-spark-ppl-test-instruction.md)
+For further information, see the [Spark PPL Test Instructions](../ppl-lang/local-spark-ppl-test-instruction.md)
## Manual Setup
diff --git a/docs/spark-emr-docker.md b/docs/docker/spark-emr-docker.md
similarity index 98%
rename from docs/spark-emr-docker.md
rename to docs/docker/spark-emr-docker.md
index 7eef4d250..e56295736 100644
--- a/docs/spark-emr-docker.md
+++ b/docs/docker/spark-emr-docker.md
@@ -18,7 +18,7 @@ sbt clean
sbt assembly
```
-Refer to the [Developer Guide](../DEVELOPER_GUIDE.md) for more information.
+Refer to the [Developer Guide](../../DEVELOPER_GUIDE.md) for more information.
## Using Docker Compose