diff --git a/docker/integ-test/.env b/docker/integ-test/.env
index cf73bdc89..7d8995956 100644
--- a/docker/integ-test/.env
+++ b/docker/integ-test/.env
@@ -5,9 +5,13 @@ MASTER_UI_PORT=8080
 MASTER_PORT=7077
 UI_PORT=4040
 SPARK_CONNECT_PORT=15002
-PPL_JAR=../../ppl-spark-integration/target/scala-2.12/ppl-spark-integration-assembly-0.7.0-SNAPSHOT.jar
-FLINT_JAR=../../flint-spark-integration/target/scala-2.12/flint-spark-integration-assembly-0.7.0-SNAPSHOT.jar
+PPL_JAR=./ppl-spark-integration/target/scala-2.12/ppl-spark-integration-assembly-0.7.0-SNAPSHOT.jar
+FLINT_JAR=./flint-spark-integration/target/scala-2.12/flint-spark-integration-assembly-0.7.0-SNAPSHOT.jar
+SQL_APP_JAR=./spark-sql-application/target/scala-2.12/sql-job-assembly-0.7.0-SNAPSHOT.jar
 OPENSEARCH_NODE_MEMORY=512m
 OPENSEARCH_ADMIN_PASSWORD=C0rrecthorsebatterystaple.
 OPENSEARCH_PORT=9200
+OPENSEARCH_PA_PORT=9600
 OPENSEARCH_DASHBOARDS_PORT=5601
+S3_ACCESS_KEY=Vt7jnvi5BICr1rkfsheT
+S3_SECRET_KEY=5NK3StGvoGCLUWvbaGN0LBUf9N6sjE94PEzLdqwO
diff --git a/docker/integ-test/configuration-updater/apply-configuration.sh b/docker/integ-test/configuration-updater/apply-configuration.sh
new file mode 100644
index 000000000..7946c75cd
--- /dev/null
+++ b/docker/integ-test/configuration-updater/apply-configuration.sh
@@ -0,0 +1,73 @@
+#!/bin/sh
+
+# Copyright OpenSearch Contributors
+# SPDX-License-Identifier: Apache-2.0
+
+# Login to Minio
+curl -q \
+  -c /tmp/minio-cookies.txt \
+  -H 'Content-Type: application/json' \
+  -d '{"accessKey": "minioadmin", "secretKey": "minioadmin"}' \
+  http://minio-S3:9001/api/v1/login
+# Delete the test bucket
+curl -b /tmp/minio-cookies.txt \
+  -X DELETE \
+  http://minio-S3:9001/api/v1/buckets/test
+# Create the integ-test bucket
+curl -q \
+  -b /tmp/minio-cookies.txt \
+  -X POST \
+  -H 'Content-Type: application/json' \
+  -d '{"name": "integ-test", "versioning": {"enabled": true, "excludePrefixes": [], "excludeFolders": false}, "locking": true}' \
+  http://minio-S3:9001/api/v1/buckets
+# Create the access key
+curl -q \
+  -b /tmp/minio-cookies.txt \
+  -X POST \
+  -H 'Content-Type: application/json' \
+  -d "{\"policy\": \"\", \"accessKey\": \"${S3_ACCESS_KEY}\", \"secretKey\": \"${S3_SECRET_KEY}\", \"description\": \"\", \"comment\": \"\", \"name\": \"\", \"expiry\": null}" \
+  http://minio-S3:9001/api/v1/service-account-credentials
+
+# Login to OpenSearch Dashboards
+echo ">>> Login to OpenSearch dashboards"
+curl -q \
+  -c /tmp/opensearch-cookies.txt \
+  -X POST \
+  -H 'Content-Type: application/json' \
+  -H 'Osd-Version: 2.18.0' \
+  -H 'Osd-Xsrf: fetch' \
+  -d "{\"username\": \"admin\", \"password\": \"${OPENSEARCH_ADMIN_PASSWORD}\"}" \
+  'http://opensearch-dashboards:5601/auth/login?dataSourceId='
+if [ "$?"
-eq "0" ]; then + echo " >>> Login successful" +else + echo " >>> Login failed" +fi +# Create the S3/Glue datasource +echo ">>> Creating datasource" +curl -q \ + -b /tmp/opensearch-cookies.txt \ + -X POST \ + -H 'Content-Type: application/json' \ + -H 'Osd-Version: 2.18.0' \ + -H 'Osd-Xsrf: fetch' \ + -d "{\"name\": \"mys3\", \"allowedRoles\": [], \"connector\": \"s3glue\", \"properties\": {\"glue.auth.type\": \"iam_role\", \"glue.auth.role_arn\": \"arn:aws:iam::123456789012:role/S3Access\", \"glue.indexstore.opensearch.uri\": \"http://opensearch:9200\", \"glue.indexstore.opensearch.auth\": \"basicauth\", \"glue.indexstore.opensearch.auth.username\": \"admin\", \"glue.indexstore.opensearch.auth.password\": \"${OPENSEARCH_ADMIN_PASSWORD}\"}}" \ + http://opensearch-dashboards:5601/api/directquery/dataconnections +if [ "$?" -eq "0" ]; then + echo " >>> S3 datasource created" +else + echo " >>> Failed to create S3 datasource" +fi + +echo ">>> Setting cluster settings" +curl -v \ + -u "admin:${OPENSEARCH_ADMIN_PASSWORD}" \ + -X PUT \ + -H 'Content-Type: application/json' \ + -d '{"persistent": {"plugins.query.executionengine.spark.config": "{\"applicationId\":\"integ-test\",\"executionRoleARN\":\"arn:aws:iam::xxxxx:role/emr-job-execution-role\",\"region\":\"us-west-2\", \"sparkSubmitParameters\": \"--conf spark.dynamicAllocation.enabled=false\"}"}}' \ + http://opensearch:9200/_cluster/settings +if [ "$?" -eq "0" ]; then + echo " >>> Successfully set cluster settings" +else + echo " >>> Failed to set cluster settings" +fi diff --git a/docker/integ-test/docker-compose.yml b/docker/integ-test/docker-compose.yml index c5ee53d7d..9fe79dc22 100644 --- a/docker/integ-test/docker-compose.yml +++ b/docker/integ-test/docker-compose.yml @@ -1,13 +1,35 @@ services: + metastore: + build: ./metastore + container_name: metastore + ports: + - "${THRIFT_PORT:-9083}:9083" + volumes: + - type: bind + source: ./metastore/hive-site.xml + target: /opt/apache-hive-2.3.9-bin/conf/hive-site.xml + - type: bind + source: ./metastore/hive-log4j2.properties + target: /opt/apache-hive-2.3.9-bin/conf/hive-log4j2.properties + - type: volume + source: metastore-data + target: /data + networks: + - opensearch-net + spark: - image: bitnami/spark:${SPARK_VERSION:-3.5.3} + build: + context: ./spark + dockerfile: Dockerfile + args: + SPARK_VERSION: ${SPARK_VERSION:-3.5.3} container_name: spark + entrypoint: /opt/bitnami/scripts/spark/spark-master-entrypoint.sh ports: - "${MASTER_UI_PORT:-8080}:8080" - "${MASTER_PORT:-7077}:7077" - "${UI_PORT:-4040}:4040" - "${SPARK_CONNECT_PORT}:15002" - entrypoint: /opt/bitnami/scripts/spark/master-entrypoint.sh environment: - SPARK_MODE=master - SPARK_RPC_AUTHENTICATION_ENABLED=no @@ -17,19 +39,10 @@ services: - SPARK_PUBLIC_DNS=localhost volumes: - type: bind - source: ./spark-master-entrypoint.sh - target: /opt/bitnami/scripts/spark/master-entrypoint.sh - - type: bind - source: ./spark-defaults.conf - target: /opt/bitnami/spark/conf/spark-defaults.conf - - type: bind - source: ./log4j2.properties - target: /opt/bitnami/spark/conf/log4j2.properties - - type: bind - source: $PPL_JAR + source: ../../$PPL_JAR target: /opt/bitnami/spark/jars/ppl-spark-integration.jar - type: bind - source: $FLINT_JAR + source: ../../$FLINT_JAR target: /opt/bitnami/spark/jars/flint-spark-integration.jar healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8080/"] @@ -40,9 +53,22 @@ services: start_interval: 5s networks: - opensearch-net + depends_on: + metastore: + condition: service_started + opensearch: + 
condition: service_healthy + opensearch-dashboards: + condition: service_healthy + configuration-updater: + condition: service_completed_successfully spark-worker: - image: bitnami/spark:${SPARK_VERSION:-3.5.3} + build: + context: ./spark + dockerfile: Dockerfile + args: + SPARK_VERSION: ${SPARK_VERSION:-3.5.3} container_name: spark-worker environment: - SPARK_MODE=worker @@ -56,32 +82,43 @@ services: - SPARK_PUBLIC_DNS=localhost volumes: - type: bind - source: ./spark-defaults.conf - target: /opt/bitnami/spark/conf/spark-defaults.conf - - type: bind - source: ./log4j2.properties - target: /opt/bitnami/spark/conf/log4j2.properties - - type: bind - source: $PPL_JAR + source: ../../$PPL_JAR target: /opt/bitnami/spark/jars/ppl-spark-integration.jar - type: bind - source: $FLINT_JAR + source: ../../$FLINT_JAR target: /opt/bitnami/spark/jars/flint-spark-integration.jar networks: - opensearch-net depends_on: - - spark + metastore: + condition: service_started + spark: + condition: service_healthy + + spark-submit: + build: + context: ../../ + dockerfile: docker/integ-test/spark-submit/Dockerfile + args: + FLINT_JAR: ${FLINT_JAR} + PPL_JAR: ${PPL_JAR} + SQL_APP_JAR: ${SQL_APP_JAR} + depends_on: + metastore: + condition: service_completed_successfully opensearch: - image: opensearchproject/opensearch:${OPENSEARCH_VERSION:-latest} + build: ./opensearch container_name: opensearch environment: - cluster.name=opensearch-cluster - node.name=opensearch - - discovery.seed_hosts=opensearch - - cluster.initial_cluster_manager_nodes=opensearch + - discovery.type=single-node - bootstrap.memory_lock=true + - plugins.security.system_indices.enabled=false + - plugins.security.system_indices.permission.enabled=false - plugins.security.ssl.http.enabled=false + - plugins.query.datasources.encryption.masterkey=9a515c99d4313f140a6607053502f4d6 - OPENSEARCH_JAVA_OPTS=-Xms${OPENSEARCH_NODE_MEMORY:-512m} -Xmx${OPENSEARCH_NODE_MEMORY:-512m} - OPENSEARCH_INITIAL_ADMIN_PASSWORD=${OPENSEARCH_ADMIN_PASSWORD} ulimits: @@ -92,12 +129,18 @@ services: soft: 65536 hard: 65536 volumes: - - opensearch-data:/usr/share/opensearch/data + - type: volume + source: opensearch-data + target: /usr/share/opensearch/data + - type: bind + source: /var/run/docker.sock + target: /var/run/docker.sock ports: - ${OPENSEARCH_PORT:-9200}:9200 - - 9600:9600 + - ${OPENSEARCH_PA_PORT:-9600}:9600 expose: - "${OPENSEARCH_PORT:-9200}" + - "9300" healthcheck: test: ["CMD", "curl", "-f", "-u", "admin:${OPENSEARCH_ADMIN_PASSWORD}", "http://localhost:9200/_cluster/health"] interval: 1m @@ -107,6 +150,9 @@ services: start_interval: 5s networks: - opensearch-net + depends_on: + minio: + condition: service_healthy opensearch-dashboards: image: opensearchproject/opensearch-dashboards:${DASHBOARDS_VERSION} @@ -119,8 +165,16 @@ services: OPENSEARCH_HOSTS: '["http://opensearch:9200"]' networks: - opensearch-net + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:5601/"] + interval: 1m + timeout: 5s + retries: 3 + start_period: 30s + start_interval: 5s depends_on: - - opensearch + opensearch: + condition: service_healthy minio: image: minio/minio @@ -132,12 +186,37 @@ services: - "9001:9001" volumes: - minio-data:/data + healthcheck: + test: ["CMD", "curl", "-q", "-f", "http://localhost:9000/minio/health/live"] + interval: 1m + timeout: 5s + retries: 3 + start_period: 30s + start_interval: 5s + networks: + - opensearch-net + + configuration-updater: + image: alpine/curl:latest + entrypoint: /bin/sh + command: /apply-configuration.sh + environment: + - 
S3_ACCESS_KEY=${S3_ACCESS_KEY} + - S3_SECRET_KEY=${S3_SECRET_KEY} + - OPENSEARCH_ADMIN_PASSWORD=${OPENSEARCH_ADMIN_PASSWORD} + volumes: + - type: bind + source: configuration-updater/apply-configuration.sh + target: /apply-configuration.sh + depends_on: + opensearch-dashboards: + condition: service_healthy networks: - opensearch-net volumes: + metastore-data: opensearch-data: minio-data: - networks: opensearch-net: diff --git a/docker/integ-test/metastore/Dockerfile b/docker/integ-test/metastore/Dockerfile new file mode 100644 index 000000000..79a7d725c --- /dev/null +++ b/docker/integ-test/metastore/Dockerfile @@ -0,0 +1,29 @@ +# Copyright OpenSearch Contributors +# SPDX-License-Identifier: Apache-2.0 + +FROM openjdk:21-jdk-bookworm + +WORKDIR /opt + +ENV HADOOP_HOME=/opt/hadoop-3.3.4 +ENV HIVE_HOME=/opt/apache-hive-2.3.9-bin + +#RUN apt-get update +RUN curl -L https://archive.apache.org/dist/hive/hive-2.3.9/apache-hive-2.3.9-bin.tar.gz | tar zxf - +RUN curl -L https://archive.apache.org/dist/hadoop/common/hadoop-3.3.4/hadoop-3.3.4.tar.gz | tar zxf - +RUN cp $HADOOP_HOME/share/hadoop/client/hadoop-client-api-3.3.4.jar $HIVE_HOME/lib/ +RUN cp $HADOOP_HOME/share/hadoop/client/hadoop-client-runtime-3.3.4.jar $HIVE_HOME/lib/ +RUN cp $HADOOP_HOME/share/hadoop/tools/lib/hadoop-aws-3.3.4.jar $HIVE_HOME/lib/ +RUN cp $HADOOP_HOME/share/hadoop/tools/lib/aws-java-sdk-bundle-1.12.262.jar $HIVE_HOME/lib/ + +RUN groupadd -f -r hive --gid=1000 +RUN useradd -r -g hive --uid=1000 -d ${HIVE_HOME} hive +RUN chown hive:hive -R ${HIVE_HOME} + +RUN mkdir /data +RUN chown hive:hive /data + +WORKDIR $HIVE_HOME +EXPOSE 9083 +ENTRYPOINT ["/opt/apache-hive-2.3.9-bin/bin/hive", "--service", "metastore"] +USER hive diff --git a/docker/integ-test/metastore/hive-log4j2.properties b/docker/integ-test/metastore/hive-log4j2.properties new file mode 100644 index 000000000..7e6a3b08e --- /dev/null +++ b/docker/integ-test/metastore/hive-log4j2.properties @@ -0,0 +1,62 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+status = INFO
+name = HiveLog4j2
+packages = org.apache.hadoop.hive.ql.log
+
+# list of properties
+property.hive.log.level = INFO
+property.hive.root.logger = console
+property.hive.log.dir = ${sys:java.io.tmpdir}/${sys:user.name}
+property.hive.log.file = hive.log
+property.hive.perflogger.log.level = INFO
+
+# list of all appenders
+appenders = console
+
+# console appender
+appender.console.type = Console
+appender.console.name = console
+appender.console.target = SYSTEM_ERR
+appender.console.layout.type = PatternLayout
+appender.console.layout.pattern = %d{ISO8601} %5p [%t] %c{2}: %m%n
+
+# list of all loggers
+loggers = NIOServerCnxn, ClientCnxnSocketNIO, DataNucleus, Datastore, JPOX, PerfLogger
+
+logger.NIOServerCnxn.name = org.apache.zookeeper.server.NIOServerCnxn
+logger.NIOServerCnxn.level = WARN
+
+logger.ClientCnxnSocketNIO.name = org.apache.zookeeper.ClientCnxnSocketNIO
+logger.ClientCnxnSocketNIO.level = WARN
+
+logger.DataNucleus.name = DataNucleus
+logger.DataNucleus.level = ERROR
+
+logger.Datastore.name = Datastore
+logger.Datastore.level = ERROR
+
+logger.JPOX.name = JPOX
+logger.JPOX.level = ERROR
+
+logger.PerfLogger.name = org.apache.hadoop.hive.ql.log.PerfLogger
+logger.PerfLogger.level = ${sys:hive.perflogger.log.level}
+
+# root logger
+rootLogger.level = ${sys:hive.log.level}
+rootLogger.appenderRefs = root
+rootLogger.appenderRef.root.ref = ${sys:hive.root.logger}
diff --git a/docker/integ-test/metastore/hive-site.xml b/docker/integ-test/metastore/hive-site.xml
new file mode 100644
index 000000000..e235306eb
--- /dev/null
+++ b/docker/integ-test/metastore/hive-site.xml
@@ -0,0 +1,53 @@
+<configuration>
+  <property>
+    <name>hive.metastore.schema.verification</name>
+    <value>false</value>
+  </property>
+  <property>
+    <name>hive.metastore.warehouse.dir</name>
+    <value>file:///tmp</value>
+  </property>
+  <property>
+    <name>fs.default.name</name>
+    <value>file:///tmp</value>
+  </property>
+  <property>
+    <name>javax.jdo.option.ConnectionURL</name>
+    <value>jdbc:derby:;databaseName=/data/metastore_db;create=true</value>
+  </property>
+  <property>
+    <name>javax.jdo.option.ConnectionDriverName</name>
+    <value>org.apache.derby.jdbc.EmbeddedDriver</value>
+  </property>
+  <property>
+    <name>datanucleus.schema.autoCreateTables</name>
+    <value>true</value>
+  </property>
+  <property>
+    <name>fs.s3a.impl</name>
+    <value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
+  </property>
+  <property>
+    <name>fs.s3a.path.style.access</name>
+    <value>true</value>
+  </property>
+  <property>
+    <name>fs.s3a.access.key</name>
+    <value>Vt7jnvi5BICr1rkfsheT</value>
+  </property>
+  <property>
+    <name>fs.s3a.secret.key</name>
+    <value>5NK3StGvoGCLUWvbaGN0LBUf9N6sjE94PEzLdqwO</value>
+  </property>
+  <property>
+    <name>fs.s3a.endpoint</name>
+    <value>http://minio-S3:9000</value>
+  </property>
+  <property>
+    <name>fs.s3a.connection.ssl.enabled</name>
+    <value>false</value>
+  </property>
+</configuration>
diff --git a/docker/integ-test/opensearch/Dockerfile b/docker/integ-test/opensearch/Dockerfile
new file mode 100644
index 000000000..da042516e
--- /dev/null
+++ b/docker/integ-test/opensearch/Dockerfile
@@ -0,0 +1,44 @@
+# Copyright OpenSearch Contributors
+# SPDX-License-Identifier: Apache-2.0
+
+FROM opensearchproject/opensearch:latest
+
+USER root
+
+RUN mkdir /tmp/alter-emr-jar
+WORKDIR /tmp/alter-emr-jar
+
+ENV AWS_VERSION=1.12.651
+
+RUN curl -O -L https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-emrserverless/${AWS_VERSION}/aws-java-sdk-emrserverless-${AWS_VERSION}.jar
+RUN curl -O -L https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-core/${AWS_VERSION}/aws-java-sdk-core-${AWS_VERSION}.jar
+
+COPY emr-src /tmp/alter-emr-jar/emr-src
+WORKDIR /tmp/alter-emr-jar/emr-src
+RUN /usr/share/opensearch/jdk/bin/javac -cp ../aws-java-sdk-emrserverless-${AWS_VERSION}.jar:../aws-java-sdk-core-${AWS_VERSION}.jar com/amazonaws/services/emrserverless/AWSEMRServerlessClientBuilder.java org/opensearch/spark/emrserverless/DockerEMRServerlessClient.java
+RUN mkdir /tmp/alter-emr-jar/extracted
+WORKDIR /tmp/alter-emr-jar/extracted
+RUN /usr/share/opensearch/jdk/bin/jar -xf
../aws-java-sdk-emrserverless-${AWS_VERSION}.jar +RUN cp ../emr-src/com/amazonaws/services/emrserverless/AWSEMRServerlessClientBuilder.class com/amazonaws/services/emrserverless/ +RUN mkdir -p org/opensearch/spark/emrserverless +RUN cp ../emr-src/org/opensearch/spark/emrserverless/DockerEMRServerlessClient.class org/opensearch/spark/emrserverless/ +RUN /usr/share/opensearch/jdk/bin/jar -cfM /usr/share/opensearch/plugins/opensearch-sql/aws-java-sdk-emrserverless-*.jar META-INF/MANIFEST.MF * +RUN chown opensearch:opensearch /usr/share/opensearch/plugins/opensearch-sql/aws-java-sdk-emrserverless-*.jar +RUN rm -rf /tmp/alter-emr-jar + +RUN yum install -y docker util-linux + +COPY opensearch-docker-it-entrypoint.sh /usr/share/opensearch/opensearch-docker-it-entrypoint.sh +COPY docker-command-runner.sh /usr/share/opensearch/docker-command-runner.sh +COPY opensearch_security.policy /usr/share/opensearch/config/opensearch-performance-analyzer/opensearch_security.policy +COPY log4j2.properties /usr/share/opensearch/config/log4j2.properties + +RUN chown opensearch:opensearch /usr/share/opensearch/config/opensearch-performance-analyzer/opensearch_security.policy +RUN chown opensearch:opensearch /usr/share/opensearch/config/log4j2.properties + +WORKDIR /usr/share/opensearch +ENTRYPOINT ["./opensearch-docker-it-entrypoint.sh"] +CMD ["opensearch"] + +EXPOSE 9200 +EXPOSE 9300 diff --git a/docker/integ-test/opensearch/docker-command-runner.sh b/docker/integ-test/opensearch/docker-command-runner.sh new file mode 100755 index 000000000..25c1927ca --- /dev/null +++ b/docker/integ-test/opensearch/docker-command-runner.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# Copyright OpenSearch Contributors +# SPDX-License-Identifier: Apache-2.0 + +function process_files { + for cmd_file in `ls -1`; do + echo "$cmd_file" | grep -q 'cmd$' + if [ "$?" -eq "0" ]; then + stdout_filename=$(echo $cmd_file | sed -e 's/cmd$/stdout/') + stderr_filename=$(echo $cmd_file | sed -e 's/cmd$/stderr/') + exit_code_filename=$(echo $cmd_file | sed -e 's/cmd/exitCode/') + + /usr/bin/docker $(cat $cmd_file) > $stdout_filename 2> $stderr_filename + echo "$?" > $exit_code_filename + + rm $cmd_file + fi + done +} + +if [ ! 
-d '/tmp/docker' ]; then + mkdir /tmp/docker + chown opensearch:opensearch /tmp/docker +fi + +cd /tmp/docker +while true; do + process_files + sleep 1 +done + diff --git a/docker/integ-test/opensearch/emr-src/com/amazonaws/services/emrserverless/AWSEMRServerlessClientBuilder.java b/docker/integ-test/opensearch/emr-src/com/amazonaws/services/emrserverless/AWSEMRServerlessClientBuilder.java new file mode 100644 index 000000000..4666b494f --- /dev/null +++ b/docker/integ-test/opensearch/emr-src/com/amazonaws/services/emrserverless/AWSEMRServerlessClientBuilder.java @@ -0,0 +1,34 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package com.amazonaws.services.emrserverless; + +import com.amazonaws.ClientConfigurationFactory; +import com.amazonaws.client.AwsSyncClientParams; +import com.amazonaws.client.builder.AwsSyncClientBuilder; + +import org.opensearch.spark.emrserverless.DockerEMRServerlessClient; + +public class AWSEMRServerlessClientBuilder extends AwsSyncClientBuilder { + private static final ClientConfigurationFactory CLIENT_CONFIG_FACTORY = new ClientConfigurationFactory(); + + private AWSEMRServerlessClientBuilder() { + super(CLIENT_CONFIG_FACTORY); + } + + public static AWSEMRServerlessClientBuilder standard() { + return new AWSEMRServerlessClientBuilder(); + } + + public static AWSEMRServerless defaultClient() { + return (AWSEMRServerless) standard().build(); + } + + protected AWSEMRServerless build(AwsSyncClientParams params) { + DockerEMRServerlessClient client = new DockerEMRServerlessClient(CLIENT_CONFIG_FACTORY.getConfig()); + client.setServiceNameIntern("emr"); + return client; + } +} diff --git a/docker/integ-test/opensearch/emr-src/org/opensearch/spark/emrserverless/DockerEMRServerlessClient.java b/docker/integ-test/opensearch/emr-src/org/opensearch/spark/emrserverless/DockerEMRServerlessClient.java new file mode 100644 index 000000000..88cc448a9 --- /dev/null +++ b/docker/integ-test/opensearch/emr-src/org/opensearch/spark/emrserverless/DockerEMRServerlessClient.java @@ -0,0 +1,216 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.spark.emrserverless; + +import com.amazonaws.AmazonWebServiceClient; +import com.amazonaws.AmazonWebServiceRequest; +import com.amazonaws.ClientConfiguration; +import com.amazonaws.ResponseMetadata; +import com.amazonaws.services.emrserverless.AWSEMRServerless; +import com.amazonaws.services.emrserverless.model.CancelJobRunRequest; +import com.amazonaws.services.emrserverless.model.CancelJobRunResult; +import com.amazonaws.services.emrserverless.model.CreateApplicationRequest; +import com.amazonaws.services.emrserverless.model.CreateApplicationResult; +import com.amazonaws.services.emrserverless.model.DeleteApplicationRequest; +import com.amazonaws.services.emrserverless.model.DeleteApplicationResult; +import com.amazonaws.services.emrserverless.model.GetApplicationRequest; +import com.amazonaws.services.emrserverless.model.GetApplicationResult; +import com.amazonaws.services.emrserverless.model.GetDashboardForJobRunRequest; +import com.amazonaws.services.emrserverless.model.GetDashboardForJobRunResult; +import com.amazonaws.services.emrserverless.model.GetJobRunRequest; +import com.amazonaws.services.emrserverless.model.GetJobRunResult; +import com.amazonaws.services.emrserverless.model.ListApplicationsRequest; +import com.amazonaws.services.emrserverless.model.ListApplicationsResult; +import 
com.amazonaws.services.emrserverless.model.ListJobRunsRequest; +import com.amazonaws.services.emrserverless.model.ListJobRunsResult; +import com.amazonaws.services.emrserverless.model.ListTagsForResourceRequest; +import com.amazonaws.services.emrserverless.model.ListTagsForResourceResult; +import com.amazonaws.services.emrserverless.model.StartApplicationRequest; +import com.amazonaws.services.emrserverless.model.StartApplicationResult; +import com.amazonaws.services.emrserverless.model.StartJobRunRequest; +import com.amazonaws.services.emrserverless.model.StartJobRunResult; +import com.amazonaws.services.emrserverless.model.StopApplicationRequest; +import com.amazonaws.services.emrserverless.model.StopApplicationResult; +import com.amazonaws.services.emrserverless.model.TagResourceRequest; +import com.amazonaws.services.emrserverless.model.TagResourceResult; +import com.amazonaws.services.emrserverless.model.UntagResourceRequest; +import com.amazonaws.services.emrserverless.model.UntagResourceResult; +import com.amazonaws.services.emrserverless.model.UpdateApplicationRequest; +import com.amazonaws.services.emrserverless.model.UpdateApplicationResult; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; + +public class DockerEMRServerlessClient extends AmazonWebServiceClient implements AWSEMRServerless { + private static final AtomicInteger JOB_ID = new AtomicInteger(1); + + public DockerEMRServerlessClient(ClientConfiguration clientConfiguration) { + super(clientConfiguration); + setEndpointPrefix("emr"); + } + + @Override + public CancelJobRunResult cancelJobRun(final CancelJobRunRequest cancelJobRunRequest) { + return null; + } + + @Override + public CreateApplicationResult createApplication(final CreateApplicationRequest createApplicationRequest) { + return null; + } + + @Override + public DeleteApplicationResult deleteApplication(final DeleteApplicationRequest deleteApplicationRequest) { + return null; + } + + @Override + public GetApplicationResult getApplication(final GetApplicationRequest getApplicationRequest) { + return null; + } + + @Override + public GetDashboardForJobRunResult getDashboardForJobRun( + final GetDashboardForJobRunRequest getDashboardForJobRunRequest) { + return null; + } + + @Override + public GetJobRunResult getJobRun(final GetJobRunRequest getJobRunRequest) { + return null; + } + + @Override + public ListApplicationsResult listApplications(final ListApplicationsRequest listApplicationsRequest) { + return null; + } + + @Override + public ListJobRunsResult listJobRuns(final ListJobRunsRequest listJobRunsRequest) { + return null; + } + + @Override + public ListTagsForResourceResult listTagsForResource( + final ListTagsForResourceRequest listTagsForResourceRequest) { + return null; + } + + @Override + public StartApplicationResult startApplication(final StartApplicationRequest startApplicationRequest) { + return null; + } + + @Override + public StartJobRunResult startJobRun(final StartJobRunRequest startJobRunRequest) { + String entryPoint = startJobRunRequest.getJobDriver().getSparkSubmit().getEntryPoint(); + List entryPointArguments = startJobRunRequest.getJobDriver().getSparkSubmit().getEntryPointArguments(); + String sparkSubmitParameters = startJobRunRequest.getJobDriver().getSparkSubmit().getSparkSubmitParameters(); + 
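+        // What follows emulates an EMR Serverless job run locally: build a `docker run` command
+        // for the integ-test-spark-submit image, drop the --conf options that only apply on real
+        // EMR (the Flint datasource settings and the Glue metastore client factory), rewrite the
+        // EMR JAVA_HOME values to the Bitnami JVM path, then hand the command to the
+        // docker-command-runner.sh poller through a file in /tmp/docker and wait for its exit-code file.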
+
+        final int jobId = JOB_ID.getAndIncrement();
+
+        List<String> runContainerCmd = new ArrayList<>();
+        runContainerCmd.add("run");
+        runContainerCmd.add("-d");
+        runContainerCmd.add("--rm");
+        runContainerCmd.add("--env");
+        runContainerCmd.add("SERVERLESS_EMR_JOB_ID=" + jobId);
+        runContainerCmd.add("--network");
+        runContainerCmd.add("integ-test_opensearch-net");
+        runContainerCmd.add("integ-test-spark-submit:latest");
+        runContainerCmd.add("/opt/bitnami/spark/bin/spark-submit");
+        runContainerCmd.add("--deploy-mode");
+        runContainerCmd.add("client");
+        runContainerCmd.add("--exclude-packages");
+        runContainerCmd.add("org.opensearch:opensearch-spark-standalone_2.12,org.opensearch:opensearch-spark-sql-application_2.12,org.opensearch:opensearch-spark-ppl_2.12");
+        runContainerCmd.add("--master");
+        runContainerCmd.add("local[2]");
+
+        runContainerCmd.addAll(Arrays.asList(sparkSubmitParameters.split(" ")));
+        runContainerCmd.addAll(entryPointArguments);
+
+        final List<String> cmd = runContainerCmd.stream().filter(s -> !s.isBlank()).collect(Collectors.toList());
+
+        for (int i = 1; i < cmd.size(); i++) {
+            if (cmd.get(i - 1).equals("--conf")) {
+                if (cmd.get(i).startsWith("spark.datasource.flint.customAWSCredentialsProvider=") ||
+                        cmd.get(i).startsWith("spark.datasource.flint.") ||
+                        cmd.get(i).startsWith("spark.hadoop.hive.metastore.client.factory.class=")) {
+                    cmd.remove(i - 1);
+                    cmd.remove(i - 1);
+                    i -= 2;
+                } else if (cmd.get(i).startsWith("spark.emr-serverless.driverEnv.JAVA_HOME=")) {
+                    cmd.set(i, "spark.emr-serverless.driverEnv.JAVA_HOME=/opt/bitnami/java");
+                } else if (cmd.get(i).startsWith("spark.executorEnv.JAVA_HOME=")) {
+                    cmd.set(i, "spark.executorEnv.JAVA_HOME=/opt/bitnami/java");
+                }
+            }
+        }
+
+        cmd.add(cmd.size() - 1, "/app/spark-sql-application.jar");
+
+        System.out.println(">>> " + String.join(" ", cmd));
+
+        try {
+            File dockerDir = new File("/tmp/docker");
+            File cmdFile = File.createTempFile("docker_", ".cmd", dockerDir);
+            FileOutputStream fos = new FileOutputStream(cmdFile);
+            fos.write(String.join(" ", cmd).getBytes(StandardCharsets.UTF_8));
+            fos.close();
+
+            String cmdFilename = cmdFile.getName();
+            String filenameBase = cmdFilename.substring(7, cmdFilename.length() - 4);
+            File exitCodeFile = new File(dockerDir, "docker_" + filenameBase + ".exitCode");
+            for (int i = 0; i < 600 && !exitCodeFile.exists(); i++) {
+                Thread.sleep(100L);
+            }
+
+            if (exitCodeFile.exists()) {
+                StartJobRunResult startJobResult = new StartJobRunResult();
+                startJobResult.setApplicationId(startJobRunRequest.getApplicationId());
+                startJobResult.setArn("arn:aws:emr-containers:foo:123456789012:/virtualclusters/0/jobruns/" + jobId);
+                startJobResult.setJobRunId(Integer.toString(jobId));
+                return startJobResult;
+            }
+        } catch (IOException | InterruptedException e) {
+            e.printStackTrace();
+        }
+
+        return null;
+    }
+
+    @Override
+    public StopApplicationResult stopApplication(final StopApplicationRequest stopApplicationRequest) {
+        return null;
+    }
+
+    @Override
+    public TagResourceResult tagResource(final TagResourceRequest tagResourceRequest) {
+        return null;
+    }
+
+    @Override
+    public UntagResourceResult untagResource(final UntagResourceRequest untagResourceRequest) {
+        return null;
+    }
+
+    @Override
+    public UpdateApplicationResult updateApplication(final UpdateApplicationRequest updateApplicationRequest) {
+        return null;
+    }
+
+    @Override
+    public ResponseMetadata getCachedResponseMetadata(final AmazonWebServiceRequest amazonWebServiceRequest) {
+        return null;
+    }
+}
diff --git
a/docker/integ-test/log4j2.properties b/docker/integ-test/opensearch/log4j2.properties similarity index 100% rename from docker/integ-test/log4j2.properties rename to docker/integ-test/opensearch/log4j2.properties diff --git a/docker/integ-test/opensearch/opensearch-docker-it-entrypoint.sh b/docker/integ-test/opensearch/opensearch-docker-it-entrypoint.sh new file mode 100755 index 000000000..277f531a9 --- /dev/null +++ b/docker/integ-test/opensearch/opensearch-docker-it-entrypoint.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +# Copyright OpenSearch Contributors +# SPDX-License-Identifier: Apache-2.0 + +./docker-command-runner.sh & +echo $! > /var/run/docker-command-runner.pid + +su opensearch ./opensearch-docker-entrypoint.sh "$@" + +kill -TERM `cat /var/run/docker-command-runner.pid` diff --git a/docker/integ-test/opensearch/opensearch_security.policy b/docker/integ-test/opensearch/opensearch_security.policy new file mode 100644 index 000000000..8e4de3246 --- /dev/null +++ b/docker/integ-test/opensearch/opensearch_security.policy @@ -0,0 +1,18 @@ +grant { + permission java.lang.management.ManagementPermission "control"; + permission java.net.SocketPermission "localhost:9600","connect,resolve"; + permission java.lang.RuntimePermission "getClassLoader"; + permission java.io.FilePermission "/tmp/docker/-", "read,write,delete"; +}; + +grant codebase "file:${java.home}/../lib/tools.jar" { + permission java.security.AllPermission; +}; + +grant codeBase "jrt:/jdk.attach" { + permission java.security.AllPermission; +}; + +grant codeBase "jrt:/jdk.internal.jvmstat" { + permission java.security.AllPermission; +}; diff --git a/docker/integ-test/spark-submit/Dockerfile b/docker/integ-test/spark-submit/Dockerfile new file mode 100644 index 000000000..d61caad52 --- /dev/null +++ b/docker/integ-test/spark-submit/Dockerfile @@ -0,0 +1,20 @@ +# Copyright OpenSearch Contributors +# SPDX-License-Identifier: Apache-2.0 + +FROM bitnami/spark:3.5.3 +ARG FLINT_JAR +ARG PPL_JAR +ARG SQL_APP_JAR +ENV FLINT_JAR $FLINT_JAR +ENV PPL_JAR $PPL_JAR +ENV SQL_APP_JAR $SQL_APP_JAR + +COPY docker/integ-test/spark/spark-defaults.conf /opt/bitnami/spark/conf/spark-defaults.conf +COPY ${FLINT_JAR} /opt/bitnami/spark/jars/flint-spark-integration.jar +COPY ${PPL_JAR} /opt/bitnami/spark/jars/ppl-spark-integration.jar + +USER root +RUN mkdir /app +COPY ${SQL_APP_JAR} /app/spark-sql-application.jar + +USER 1001 diff --git a/docker/integ-test/spark-defaults.conf b/docker/integ-test/spark-submit/spark-defaults.conf similarity index 74% rename from docker/integ-test/spark-defaults.conf rename to docker/integ-test/spark-submit/spark-defaults.conf index 19b9e4ec1..acd49625b 100644 --- a/docker/integ-test/spark-defaults.conf +++ b/docker/integ-test/spark-submit/spark-defaults.conf @@ -25,11 +25,21 @@ # spark.serializer org.apache.spark.serializer.KryoSerializer # spark.driver.memory 5g # spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" +spark.sql.catalogImplementation hive +spark.hadoop.hive.metastore.uris thrift://metastore:9083 spark.sql.extensions org.opensearch.flint.spark.FlintPPLSparkExtensions,org.opensearch.flint.spark.FlintSparkExtensions spark.sql.catalog.dev org.apache.spark.opensearch.catalog.OpenSearchCatalog +spark.sql.catalog.mys3 org.opensearch.sql.FlintDelegatingSessionCatalog spark.datasource.flint.host opensearch spark.datasource.flint.port 9200 spark.datasource.flint.scheme http spark.datasource.flint.auth basic spark.datasource.flint.auth.username admin 
spark.datasource.flint.auth.password C0rrecthorsebatterystaple. +spark.sql.warehouse.dir s3a://integ-test +spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem +spark.hadoop.fs.s3a.path.style.access true +spark.hadoop.fs.s3a.access.key Vt7jnvi5BICr1rkfsheT +spark.hadoop.fs.s3a.secret.key 5NK3StGvoGCLUWvbaGN0LBUf9N6sjE94PEzLdqwO +spark.hadoop.fs.s3a.endpoint http://minio-S3:9000 +spark.hadoop.fs.s3a.connection.ssl.enabled false \ No newline at end of file diff --git a/docker/integ-test/spark/Dockerfile b/docker/integ-test/spark/Dockerfile new file mode 100644 index 000000000..f180320a6 --- /dev/null +++ b/docker/integ-test/spark/Dockerfile @@ -0,0 +1,14 @@ +# Copyright OpenSearch Contributors +# SPDX-License-Identifier: Apache-2.0 + +ARG SPARK_VERSION=3.5.3 +FROM bitnami/spark:${SPARK_VERSION} + +USER root +RUN apt update +RUN apt install -y curl + +USER 1001 +COPY ./spark-defaults.conf /opt/bitnami/spark/conf/spark-defaults.conf +COPY ./log4j2.properties /opt/bitnami/spark/conf/log4j2.properties +COPY ./spark-master-entrypoint.sh /opt/bitnami/scripts/spark/spark-master-entrypoint.sh diff --git a/docker/integ-test/spark/log4j2.properties b/docker/integ-test/spark/log4j2.properties new file mode 100644 index 000000000..ab96e03ba --- /dev/null +++ b/docker/integ-test/spark/log4j2.properties @@ -0,0 +1,69 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Set everything to be logged to the console +rootLogger.level = info +rootLogger.appenderRef.stdout.ref = console + +# In the pattern layout configuration below, we specify an explicit `%ex` conversion +# pattern for logging Throwables. If this was omitted, then (by default) Log4J would +# implicitly add an `%xEx` conversion pattern which logs stacktraces with additional +# class packaging information. That extra information can sometimes add a substantial +# performance overhead, so we disable it in our default logging config. +# For more information, see SPARK-39361. +appender.console.type = Console +appender.console.name = console +appender.console.target = SYSTEM_ERR +appender.console.layout.type = PatternLayout +appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n%ex + +# Set the default spark-shell/spark-sql log level to WARN. When running the +# spark-shell/spark-sql, the log level for these classes is used to overwrite +# the root logger's log level, so that the user can have different defaults +# for the shell and regular Spark apps. 
+logger.repl.name = org.apache.spark.repl.Main +logger.repl.level = warn + +logger.thriftserver.name = org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver +logger.thriftserver.level = warn + +# Settings to quiet third party logs that are too verbose +logger.jetty1.name = org.sparkproject.jetty +logger.jetty1.level = warn +logger.jetty2.name = org.sparkproject.jetty.util.component.AbstractLifeCycle +logger.jetty2.level = error +logger.replexprTyper.name = org.apache.spark.repl.SparkIMain$exprTyper +logger.replexprTyper.level = info +logger.replSparkILoopInterpreter.name = org.apache.spark.repl.SparkILoop$SparkILoopInterpreter +logger.replSparkILoopInterpreter.level = info +logger.parquet1.name = org.apache.parquet +logger.parquet1.level = error +logger.parquet2.name = parquet +logger.parquet2.level = error + +# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support +logger.RetryingHMSHandler.name = org.apache.hadoop.hive.metastore.RetryingHMSHandler +logger.RetryingHMSHandler.level = fatal +logger.FunctionRegistry.name = org.apache.hadoop.hive.ql.exec.FunctionRegistry +logger.FunctionRegistry.level = error + +# For deploying Spark ThriftServer +# SPARK-34128: Suppress undesirable TTransportException warnings involved in THRIFT-4805 +appender.console.filter.1.type = RegexFilter +appender.console.filter.1.regex = .*Thrift error occurred during processing of message.* +appender.console.filter.1.onMatch = deny +appender.console.filter.1.onMismatch = neutral diff --git a/docker/integ-test/spark/spark-defaults.conf b/docker/integ-test/spark/spark-defaults.conf new file mode 100644 index 000000000..acd49625b --- /dev/null +++ b/docker/integ-test/spark/spark-defaults.conf @@ -0,0 +1,45 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Default system properties included when running spark-submit. +# This is useful for setting default environmental settings. 
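+#
+# The settings at the bottom of this file are specific to this integration-test cluster: they point
+# Spark at the metastore container, the OpenSearch server (Flint and PPL extensions), and the Minio
+# S3 bucket.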
+ +# Example: +# spark.master spark://master:7077 +# spark.eventLog.enabled true +# spark.eventLog.dir hdfs://namenode:8021/directory +# spark.serializer org.apache.spark.serializer.KryoSerializer +# spark.driver.memory 5g +# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" +spark.sql.catalogImplementation hive +spark.hadoop.hive.metastore.uris thrift://metastore:9083 +spark.sql.extensions org.opensearch.flint.spark.FlintPPLSparkExtensions,org.opensearch.flint.spark.FlintSparkExtensions +spark.sql.catalog.dev org.apache.spark.opensearch.catalog.OpenSearchCatalog +spark.sql.catalog.mys3 org.opensearch.sql.FlintDelegatingSessionCatalog +spark.datasource.flint.host opensearch +spark.datasource.flint.port 9200 +spark.datasource.flint.scheme http +spark.datasource.flint.auth basic +spark.datasource.flint.auth.username admin +spark.datasource.flint.auth.password C0rrecthorsebatterystaple. +spark.sql.warehouse.dir s3a://integ-test +spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem +spark.hadoop.fs.s3a.path.style.access true +spark.hadoop.fs.s3a.access.key Vt7jnvi5BICr1rkfsheT +spark.hadoop.fs.s3a.secret.key 5NK3StGvoGCLUWvbaGN0LBUf9N6sjE94PEzLdqwO +spark.hadoop.fs.s3a.endpoint http://minio-S3:9000 +spark.hadoop.fs.s3a.connection.ssl.enabled false \ No newline at end of file diff --git a/docker/integ-test/spark-master-entrypoint.sh b/docker/integ-test/spark/spark-master-entrypoint.sh similarity index 88% rename from docker/integ-test/spark-master-entrypoint.sh rename to docker/integ-test/spark/spark-master-entrypoint.sh index a21c20643..51caa787b 100755 --- a/docker/integ-test/spark-master-entrypoint.sh +++ b/docker/integ-test/spark/spark-master-entrypoint.sh @@ -1,5 +1,8 @@ #!/bin/bash +# Copyright OpenSearch Contributors +# SPDX-License-Identifier: Apache-2.0 + function start_spark_connect() { sc_version=$(ls -1 /opt/bitnami/spark/jars/spark-core_*.jar | sed -e 's/^.*\/spark-core_//' -e 's/\.jar$//' -e 's/-/:/') diff --git a/docs/docker/integ-test/README.md b/docs/docker/integ-test/README.md new file mode 100644 index 000000000..30bf2b1f2 --- /dev/null +++ b/docs/docker/integ-test/README.md @@ -0,0 +1,226 @@ +# Docker Cluster for Integration Testing + +## Introduction + +The docker cluster in `docker/integ-test` is designed to be used for integration testing. It supports the following +use cases: +1. Submitting queries directly to Spark in order to test the PPL extension for Spark. +2. Submitting queries directly to Spark that use the OpenSearch datasource. Useful for testing the Flint extension + for Spark. +3. Using the Async API to submit queries to the OpenSearch server. Useful for testing the EMR workflow and querying + S3/Glue datasources. A local container is run rather than using the AWS EMR service. + +The cluster consists of several containers and handles configuring them. No tables are created. + +## Overview + +![Docker Containers](images/integ-test-containers.png "Docker Containers") + +All containers run in a dedicated docker network. + +### OpenSearch Dashboards + +An OpenSearch dashboards server that is connected to the OpenSearch server. It is exposed to the host OS, +so it can be accessed with a browser. + +### OpenSearch + +An OpenSearch server. It is running in standalone mode. It is exposed to the host OS. It is configured to have +an S3/Glue datasource with the name `mys3`. System indices and system indices permissions are disabled. + +This container also has a docker volume used to persist data such as local indices. 
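+
+To confirm that the `mys3` datasource was registered once the cluster is up, you can query the SQL plugin's
+datasources API (this example assumes the default `OPENSEARCH_PORT` and `OPENSEARCH_ADMIN_PASSWORD` values
+from the `.env` file):
+```shell
+curl \
+  -u 'admin:C0rrecthorsebatterystaple.' \
+  'http://localhost:9200/_plugins/_query/_datasources/mys3'
+```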
+
+### Spark
+
+The Spark master node. It is configured to use an external Hive metastore in the container `metastore`. The
+Spark master also has the Flint and PPL extensions installed. It can use locally built Jar files when building
+the docker image.
+
+Spark Connect is also running in this container and can be used to easily issue queries to run. The port for
+Spark Connect is exposed to the host OS.
+
+Spark is configured to have an OpenSearch datastore with the catalog name `dev`. Indices on the OpenSearch
+server can be queried as `dev.default.<index name>`.
+
+### Spark Worker
+
+The Spark worker node. It is configured to use an external Hive metastore in the container `metastore`. The
+Spark worker also has the Flint and PPL extensions installed. It can use locally built Jar files when building
+the docker image.
+
+### Spark Submit
+
+A temporary container that runs queries for an Async API session. It is started by the OpenSearch container. It
+does not connect to the Spark cluster and instead runs the queries locally. It will keep looking for more
+queries to run until it reaches its timeout (3 minutes by default).
+
+The Spark submit container is configured to use an external Hive metastore in the container `metastore`. The
+Flint and PPL extensions are installed. When building the docker image, locally built Jar files can be used.
+
+### Metastore (Hive)
+
+A Hive server that is used as a metastore for the Spark containers. It is configured to use the Minio
+container in the bucket `integ-test`.
+
+This container also has a docker volume used to persist the metastore.
+
+### Minio (S3)
+
+A Minio server that acts as an S3 server. It is used as part of the workflow of executing an S3/Glue query.
+It will contain the S3 tables data.
+
+This container also has a docker volume used to persist the S3 data.
+
+### Configuration-Updater
+
+A temporary container that is used to configure the OpenSearch and Minio containers. It is run after both
+of those have started up. For Minio, it will add the `integ-test` bucket and create an access key. For
+OpenSearch, it will create the S3/Glue datasource and apply a cluster configuration.
+
+## Running the Cluster
+
+To start the cluster, go to the directory `docker/integ-test` and use Docker Compose. When starting the
+cluster, wait for the `spark-worker` container to finish starting up. It is the last container to start.
+
+Start the cluster in the foreground:
+```shell
+docker compose up
+```
+
+Start the cluster in the background:
+```shell
+docker compose up -d
+```
+
+Stop the cluster:
+```shell
+docker compose down
+```
+
+## Creating Tables in S3
+
+Tables need to be created in Spark as external tables. Their location must be set to a path under `s3a://integ-test/`.
+You can use `spark-shell` on the Spark master container to do this:
+```shell
+docker exec -it spark spark-shell
+```
+
+Example for creating a table and adding data:
+```scala
+spark.sql("CREATE EXTERNAL TABLE foo (id int, name varchar(100)) location 's3a://integ-test/foo'")
+spark.sql("INSERT INTO foo (id, name) VALUES(1, 'Foo')")
+```
+
+## Querying an S3 Table
+
+A REST call to the OpenSearch container can be used to query the table using the Async API.
+
+[Async Query Creation API](https://github.com/opensearch-project/sql/blob/main/docs/user/interfaces/asyncqueryinterface.rst#async-query-creation-api)
+```shell
+curl \
+  -u 'admin:C0rrecthorsebatterystaple.'
\
+  -X POST \
+  -H 'Content-Type: application/json' \
+  -d '{"datasource": "mys3", "lang": "sql", "query": "SELECT * FROM mys3.default.foo"}' \
+  http://localhost:9200/_plugins/_async_query
+```
+
+Sample response:
+```json
+{
+  "queryId": "HlbM61kX6MDkAktO",
+  "sessionId": "1Giy65ZnzNlmsPAm"
+}
+```
+
+When the query is finished, the results can be retrieved with a REST call to the OpenSearch container.
+
+[Async Query Result API](https://github.com/opensearch-project/sql/blob/main/docs/user/interfaces/asyncqueryinterface.rst#async-query-result-api)
+```shell
+curl \
+  -u 'admin:C0rrecthorsebatterystaple.' \
+  -X GET \
+  'http://localhost:9200/_plugins/_async_query/HlbM61kX6MDkAktO'
+```
+
+Sample response:
+```json
+{
+  "status": "SUCCESS",
+  "schema": [
+    {
+      "name": "id",
+      "type": "integer"
+    },
+    {
+      "name": "name",
+      "type": "string"
+    }
+  ],
+  "datarows": [
+    [
+      1,
+      "Foo"
+    ]
+  ],
+  "total": 1,
+  "size": 1
+}
+```
+
+## Configuration of the Cluster
+
+There are several settings that can be adjusted for the cluster.
+
+* SPARK_VERSION - the tag of the `bitnami/spark` docker image to use
+* OPENSEARCH_VERSION - the tag of the `opensearchproject/opensearch` docker image to use
+* DASHBOARDS_VERSION - the tag of the `opensearchproject/opensearch-dashboards` docker image to use
+* MASTER_UI_PORT - port on the host OS to map to the master UI port (8080) of the Spark master
+* MASTER_PORT - port on the host OS to map to the master port (7077) on the Spark master
+* UI_PORT - port on the host OS to map to the UI port (4040) on the Spark master
+* SPARK_CONNECT_PORT - port on the host OS to map to the Spark Connect port (15002) on the Spark master
+* PPL_JAR - The relative path to the PPL extension Jar file. Must be within the base directory of this repository
+* FLINT_JAR - The relative path to the Flint extension Jar file. Must be within the base directory of this
+  repository
+* SQL_APP_JAR - The relative path to the SQL application Jar file. Must be within the base directory of this
+  repository
+* OPENSEARCH_NODE_MEMORY - Amount of memory to allocate for the OpenSearch server
+* OPENSEARCH_ADMIN_PASSWORD - Password for the admin user of the OpenSearch server
+* OPENSEARCH_PORT - port on the host OS to map to port 9200 on the OpenSearch server
+* OPENSEARCH_PA_PORT - port on the host OS to map to the performance analyzer port (9600) on the OpenSearch
+  server
+* OPENSEARCH_DASHBOARDS_PORT - port on the host OS to map to port 5601 on the OpenSearch Dashboards server
+* S3_ACCESS_KEY - access key to create on the Minio container
+* S3_SECRET_KEY - secret key to create on the Minio container
+
+## Async API Overview
+
+[Async API Interfaces](https://github.com/opensearch-project/sql/blob/main/docs/user/interfaces/asyncqueryinterface.rst)
+
+[Async API Documentation](https://opensearch.org/docs/latest/search-plugins/async/index/)
+
+The Async API is able to query S3/Glue datasources. This is done by calling the AWS EMR Serverless service to
+run the query in a docker container. The docker container uses Spark and is able to access the Glue catalog and
+retrieve data from S3.
+
+For the docker cluster, Minio is used in place of S3. Docker itself is used in place of AWS EMR.
+
+![OpenSearch Async API Sequence Diagram](images/OpenSearch_Async_API.png "Sequence Diagram")
+
+1. Client submits a request to the async query API endpoint
+2. OpenSearch server creates a special index (if it doesn't exist). This index is used to store async API requests
+   along with some state information.
+3. 
OpenSearch server checks if the query is for an S3/Glue datasource. If it is not, then OpenSearch can handle
+   the request on its own.
+4. OpenSearch uses docker to start a new container to process queries for the current async API session.
+5. OpenSearch returns the queryId and sessionId to the Client.
+6. Spark submit docker container starts up.
+7. Spark submit docker container searches the index from step 2 for a query to run in the current session.
+8. Spark submit docker container creates a special OpenSearch index (if it doesn't exist). This index is used to
+   store the results of the async API queries.
+9. Spark submit docker container looks up the table metadata from the `metastore` container.
+10. Spark submit docker container retrieves the data from the Minio container.
+11. Spark submit docker container writes the results to the OpenSearch index from step 8.
+12. Client submits a request to the async query results API endpoint using the queryId from step 5.
+13. OpenSearch returns the results to the Client.
diff --git a/docs/docker/integ-test/images/OpenSearch_Async_API.png b/docs/docker/integ-test/images/OpenSearch_Async_API.png
new file mode 100644
index 000000000..882f73902
Binary files /dev/null and b/docs/docker/integ-test/images/OpenSearch_Async_API.png differ
diff --git a/docs/docker/integ-test/images/integ-test-containers.png b/docs/docker/integ-test/images/integ-test-containers.png
new file mode 100644
index 000000000..92878e187
Binary files /dev/null and b/docs/docker/integ-test/images/integ-test-containers.png differ
diff --git a/docs/spark-docker.md b/docs/docker/spark-docker.md
similarity index 96%
rename from docs/spark-docker.md
rename to docs/docker/spark-docker.md
index d1200e2b3..9c505f1cd 100644
--- a/docs/spark-docker.md
+++ b/docs/docker/spark-docker.md
@@ -19,7 +19,7 @@ sbt clean
 sbt assembly
 ```
 
-Refer to the [Developer Guide](../DEVELOPER_GUIDE.md) for more information.
+Refer to the [Developer Guide](../../DEVELOPER_GUIDE.md) for more information.
 
 ## Using Docker Compose
 
@@ -65,7 +65,7 @@ spark.sql("INSERT INTO test_table (id, name) VALUES(2, 'Bar')")
 spark.sql("source=test_table | eval x = id + 5 | fields x, name").show()
 ```
 
-For further information, see the [Spark PPL Test Instructions](ppl-lang/local-spark-ppl-test-instruction.md)
+For further information, see the [Spark PPL Test Instructions](../ppl-lang/local-spark-ppl-test-instruction.md)
 
 ## Manual Setup
diff --git a/docs/spark-emr-docker.md b/docs/docker/spark-emr-docker.md
similarity index 98%
rename from docs/spark-emr-docker.md
rename to docs/docker/spark-emr-docker.md
index 7eef4d250..e56295736 100644
--- a/docs/spark-emr-docker.md
+++ b/docs/docker/spark-emr-docker.md
@@ -18,7 +18,7 @@ sbt clean
 sbt assembly
 ```
 
-Refer to the [Developer Guide](../DEVELOPER_GUIDE.md) for more information.
+Refer to the [Developer Guide](../../DEVELOPER_GUIDE.md) for more information.
 
 ## Using Docker Compose