Skip to content

Commit

Permalink
all
Browse files Browse the repository at this point in the history
  • Loading branch information
lzyy2024 committed Feb 4, 2025
1 parent 53a82f1 commit 124faf8
Show file tree
Hide file tree
Showing 10 changed files with 577 additions and 7 deletions.
241 changes: 240 additions & 1 deletion be/src/vec/functions/function_compress.cpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,242 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <glog/logging.h>

#include <array>
#include <cctype>
#include <charconv>
#include <cstddef>
#include <cstring>
#include <functional>
#include <memory>
#include <string>
#include <system_error>
#include <utility>

#include "common/status.h"
#include "util/block_compression.h"
#include "util/faststring.h"
#include "vec/aggregate_functions/aggregate_function.h"
#include "vec/columns/column.h"
#include "vec/columns/column_nullable.h"
#include "vec/columns/column_string.h"
#include "vec/columns/column_vector.h"
#include "vec/columns/columns_number.h"
#include "vec/common/assert_cast.h"
#include "vec/core/block.h"
#include "vec/core/column_numbers.h"
#include "vec/core/column_with_type_and_name.h"
#include "vec/core/types.h"
#include "vec/data_types/data_type.h"
#include "vec/data_types/data_type_nullable.h"
#include "vec/data_types/data_type_number.h"
#include "vec/data_types/data_type_string.h"
#include "vec/functions/function.h"
#include "vec/functions/simple_function_factory.h"

namespace doris {
class FunctionContext;
} // namespace doris

namespace doris::vectorized {

// SQL scalar function `compress(str)`.
//
// Output layout produced for each row:
//   * empty input     -> the two characters "0x"
//   * non-empty input -> "0x", then 8 upper-case hex digits that encode the low 32 bits of the
//                        input length one byte at a time, least-significant byte first
//                        (length 1 -> "01000000"), then the raw bytes produced by the ZLIB
//                        block compression codec.
class FunctionCompress : public IFunction {
    static constexpr std::array<char, 16> HEX_ITOC = {'0', '1', '2', '3', '4', '5', '6', '7',
                                                      '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'};
    // Size of the "0x" marker on its own (the whole output for an empty input).
    static constexpr size_t PREFIX_SIZE = 2;
    // "0x" marker plus the 8 hex digits of the length.
    static constexpr size_t HEADER_SIZE = 10;

public:
    static constexpr auto name = "compress";
    static FunctionPtr create() { return std::make_shared<FunctionCompress>(); }

    String get_name() const override { return name; }

    size_t get_number_of_arguments() const override { return 1; }

    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
        return std::make_shared<DataTypeString>();
    }

    // Compresses every row of the string column at `arguments[0]` and stores the resulting
    // string column at position `result` of `block`.
    // Returns the codec lookup error or the first compression error, otherwise Status::OK().
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
                        uint32_t result, size_t input_rows_count) const override {
        // Get the compression algorithm object
        BlockCompressionCodec* compression_codec;
        RETURN_IF_ERROR(get_block_compression_codec(segment_v2::CompressionTypePB::ZLIB,
                                                    &compression_codec));

        const auto& arg_column =
                assert_cast<const ColumnString&>(*block.get_by_position(arguments[0]).column);
        auto result_column = ColumnString::create();

        auto& arg_data = arg_column.get_chars();
        auto& arg_offset = arg_column.get_offsets();
        const char* arg_begin = reinterpret_cast<const char*>(arg_data.data());

        auto& col_data = result_column->get_chars();
        auto& col_offset = result_column->get_offsets();
        col_offset.resize(input_rows_count);

        faststring compressed_str;
        Slice data;

        // Rough capacity hint only; the buffer is grown per row below.
        // NOTE(review): indexing the offsets with `-1` (here when there are no rows, and below
        // for row 0) presumably relies on the offsets container being readable one element
        // before its start and yielding 0 there - confirm against the container type.
        size_t total = arg_offset[input_rows_count - 1];
        col_data.reserve(total / 1000);

        for (size_t row = 0; row < input_rows_count; row++) {
            const size_t length = arg_offset[row] - arg_offset[row - 1];
            data = Slice(arg_begin + arg_offset[row - 1], length);

            // Start of this row's output inside col_data.
            const size_t idx = col_data.size();
            if (length == 0) { // data is ''
                col_data.resize(idx + PREFIX_SIZE);
                col_data[idx] = '0';
                col_data[idx + 1] = 'x';
                col_offset[row] = col_data.size();
                continue;
            }

            // Propagate a codec failure instead of emitting whatever is left in the buffer.
            RETURN_IF_ERROR(compression_codec->compress(data, &compressed_str));

            // Room for the header AND the payload; growing by the payload size alone would make
            // the writes below run past the end of the buffer.
            col_data.resize(idx + HEADER_SIZE + compressed_str.size());

            // The first ten characters are "0x" followed by the uncompressed length in hex.
            col_data[idx] = '0';
            col_data[idx + 1] = 'x';
            for (size_t i = 0; i < 4; i++) {
                const unsigned char byte = (length >> (i * 8)) & 0xFF;
                col_data[idx + 2 + i * 2] = HEX_ITOC[byte >> 4]; // higher four bits
                col_data[idx + 3 + i * 2] = HEX_ITOC[byte & 0x0F];
            }

            // The compressed size is not known in advance, so the codec writes into a scratch
            // buffer that is then copied behind the header.
            memcpy(col_data.data() + idx + HEADER_SIZE, compressed_str.data(),
                   compressed_str.size());

            // The end offset of the row is exactly the number of bytes written so far.
            col_offset[row] = col_data.size();
        }

        block.replace_by_position(result, std::move(result_column));
        return Status::OK();
    }
};

// SQL scalar function `uncompress(str)`, the inverse of `compress`.
//
// Accepted input for each row:
//   * ''   -> ''
//   * "0x" -> '' (this is what `compress` emits for an empty string)
//   * "0x" + 8 hex digits (declared uncompressed length, least-significant byte first)
//     + ZLIB-compressed bytes -> the decompressed bytes
// Any other input, or input the codec rejects, yields NULL for that row.
class FunctionUncompress : public IFunction {
    // "0x" marker plus the 8 hex digits of the length.
    static constexpr size_t HEADER_SIZE = 10;

    // Decodes the 8 hex digits starting at `hex` into `*length`. Each pair of digits is one
    // byte and the pairs are ordered least-significant byte first ("01000000" -> 1).
    // Returns false, leaving `*length` untouched, if any of the 8 characters is not a hex digit.
    static bool decode_length(const char* hex, unsigned int* length) {
        unsigned int value = 0;
        for (size_t i = 0; i < 4; i++) {
            const char* first = hex + i * 2;
            const char* last = first + 2;
            unsigned char byte = 0;
            const auto parsed = std::from_chars(first, last, byte, 16);
            // Both characters must have been consumed, otherwise one of them is not a hex digit.
            if (parsed.ec != std::errc() || parsed.ptr != last) {
                return false;
            }
            // Widen before shifting so that a byte >= 0x80 shifted by 24 cannot overflow an int.
            value |= static_cast<unsigned int>(byte) << (8 * i);
        }
        *length = value;
        return true;
    }

public:
    static constexpr auto name = "uncompress";
    static FunctionPtr create() { return std::make_shared<FunctionUncompress>(); }

    String get_name() const override { return name; }

    size_t get_number_of_arguments() const override { return 1; }

    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
        return make_nullable(std::make_shared<DataTypeString>());
    }

    // Decompresses every row of the string column at `arguments[0]` and stores the resulting
    // nullable string column at position `result` of `block`. Rows that cannot be decoded are
    // set to NULL; only a failure to obtain the codec is reported through the returned Status.
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
                        uint32_t result, size_t input_rows_count) const override {
        // Get the compression algorithm object
        BlockCompressionCodec* compression_codec;
        RETURN_IF_ERROR(get_block_compression_codec(segment_v2::CompressionTypePB::ZLIB,
                                                    &compression_codec));

        const auto& arg_column =
                assert_cast<const ColumnString&>(*block.get_by_position(arguments[0]).column);

        auto& arg_data = arg_column.get_chars();
        auto& arg_offset = arg_column.get_offsets();
        const char* arg_begin = reinterpret_cast<const char*>(arg_data.data());

        auto result_column = ColumnString::create();
        auto& col_data = result_column->get_chars();
        auto& col_offset = result_column->get_offsets();
        col_offset.resize(input_rows_count);

        auto null_column = ColumnUInt8::create(input_rows_count);
        auto& null_map = null_column->get_data();

        Slice data;
        Slice uncompressed_slice;

        // Capacity hint only: the output is normally at least as large as the input. The exact
        // size of each row is taken from its header below, so no larger speculative reservation
        // is made.
        // NOTE(review): indexing the offsets with `-1` (here when there are no rows, and below
        // for row 0) presumably relies on the offsets container being readable one element
        // before its start and yielding 0 there - confirm against the container type.
        size_t total = arg_offset[input_rows_count - 1];
        col_data.reserve(total);

        for (size_t row = 0; row < input_rows_count; row++) {
            null_map[row] = false;
            const size_t data_length = arg_offset[row] - arg_offset[row - 1];
            data = Slice(arg_begin + arg_offset[row - 1], data_length);

            // Start of this row's output inside col_data; also the row's end offset whenever
            // nothing is written for it.
            const size_t idx = col_data.size();

            if (data_length == 0) { // The original data is ''
                col_offset[row] = idx;
                continue;
            }

            const bool has_prefix = data_length >= 2 && data[0] == '0' && data[1] == 'x';
            if (has_prefix && data_length == 2) { // compress('') emits exactly "0x"
                col_offset[row] = idx;
                continue;
            }

            // The first ten characters must be "0x" followed by the length as 8 hex digits.
            unsigned int length = 0;
            if (!has_prefix || data_length < HEADER_SIZE ||
                !decode_length(data.data + 2, &length)) {
                col_offset[row] = idx;
                null_map[row] = true;
                continue;
            }

            // NOTE(review): `length` comes straight from the input, so a crafted header can
            // request a buffer of up to 4 GiB here; consider enforcing an upper bound.
            col_data.resize(idx + length);
            uncompressed_slice = Slice(col_data.data() + idx, length);

            Slice compressed_data(data.data + HEADER_SIZE, data.size - HEADER_SIZE);
            auto st = compression_codec->decompress(compressed_data, &uncompressed_slice);

            if (!st.ok()) { // is not a legal compressed string
                col_data.resize(idx); // drop the space reserved for this row
                col_offset[row] = idx;
                null_map[row] = true;
                continue;
            }

            // NOTE(review): the codec presumably reports the number of bytes it actually
            // produced through `uncompressed_slice.size` (not visible from this file - confirm).
            // When that is smaller than the declared length, trim the row so that a header
            // overstating the length cannot leave unwritten bytes in the result.
            if (uncompressed_slice.size < length) {
                col_data.resize(idx + uncompressed_slice.size);
            }
            col_offset[row] = col_data.size();
        }

        block.replace_by_position(
                result, ColumnNullable::create(std::move(result_column), std::move(null_column)));
        return Status::OK();
    }
};

// Registers the FunctionCompress and FunctionUncompress function classes with the given factory.
void register_function_compress(SimpleFunctionFactory& factory) {
factory.register_function<FunctionCompress>();
factory.register_function<FunctionUncompress>();
}

} // namespace doris::vectorized
2 changes: 2 additions & 0 deletions be/src/vec/functions/simple_function_factory.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ void register_function_ip(SimpleFunctionFactory& factory);
void register_function_multi_match(SimpleFunctionFactory& factory);
void register_function_split_by_regexp(SimpleFunctionFactory& factory);
void register_function_assert_true(SimpleFunctionFactory& factory);
void register_function_compress(SimpleFunctionFactory& factory);
void register_function_bit_test(SimpleFunctionFactory& factory);

class SimpleFunctionFactory {
Expand Down Expand Up @@ -301,6 +302,7 @@ class SimpleFunctionFactory {
register_function_split_by_regexp(instance);
register_function_assert_true(instance);
register_function_bit_test(instance);
register_function_compress(instance);
});
return instance;
}
Expand Down
2 changes: 1 addition & 1 deletion conf/be.conf
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ JAVA_OPTS="-Dfile.encoding=UTF-8 -Xmx2048m -DlogPath=$LOG_DIR/jni.log -Xloggc:$L
JAVA_OPTS_FOR_JDK_17="-Dfile.encoding=UTF-8 -Djol.skipHotspotSAAttach=true -Xmx2048m -DlogPath=$LOG_DIR/jni.log -Xlog:gc*:$LOG_DIR/be.gc.log.$CUR_DATE:time,uptime:filecount=10,filesize=50M -Djavax.security.auth.useSubjectCredsOnly=false -Dsun.security.krb5.debug=true -Dsun.java.command=DorisBE -XX:-CriticalJNINatives -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED --add-opens=java.management/sun.management=ALL-UNNAMED -Darrow.enable_null_check_for_get=false"

# Set your own JAVA_HOME
# JAVA_HOME=/path/to/jdk/
# JAVA_HOME=/home/lzy/doris_need/to/java/jdk-17.0.10

# https://github.com/apache/doris/blob/master/docs/zh-CN/community/developer-guide/debug-tool.md#jemalloc-heap-profile
# https://jemalloc.net/jemalloc.3.html
Expand Down
8 changes: 4 additions & 4 deletions conf/fe.conf
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,13 @@ CUR_DATE=`date +%Y%m%d-%H%M%S`
LOG_DIR = ${DORIS_HOME}/log

# For jdk 8
JAVA_OPTS="-Dfile.encoding=UTF-8 -Djavax.security.auth.useSubjectCredsOnly=false -Xss4m -Xmx8192m -XX:+UnlockExperimentalVMOptions -XX:+UseG1GC -XX:MaxGCPauseMillis=200 -XX:+PrintGCDateStamps -XX:+PrintGCDetails -XX:+PrintClassHistogramAfterFullGC -Xloggc:$LOG_DIR/log/fe.gc.log.$CUR_DATE -XX:+UseGCLogFileRotation -XX:NumberOfGCLogFiles=10 -XX:GCLogFileSize=50M -Dlog4j2.formatMsgNoLookups=true"
JAVA_OPTS="-Dfile.encoding=UTF-8 -Djavax.security.auth.useSubjectCredsOnly=false -Xss4m -Xmx16384m -XX:+UnlockExperimentalVMOptions -XX:+UseG1GC -XX:MaxGCPauseMillis=200 -XX:+PrintGCDateStamps -XX:+PrintGCDetails -XX:+PrintClassHistogramAfterFullGC -Xloggc:$LOG_DIR/log/fe.gc.log.$CUR_DATE -XX:+UseGCLogFileRotation -XX:NumberOfGCLogFiles=10 -XX:GCLogFileSize=50M -Dlog4j2.formatMsgNoLookups=true"

# For jdk 17, this JAVA_OPTS will be used as default JVM options
JAVA_OPTS_FOR_JDK_17="-Dfile.encoding=UTF-8 -Djavax.security.auth.useSubjectCredsOnly=false -Xmx8192m -Xms8192m -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=$LOG_DIR -Xlog:gc*,classhisto*=trace:$LOG_DIR/fe.gc.log.$CUR_DATE:time,uptime:filecount=10,filesize=50M --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens java.base/jdk.internal.ref=ALL-UNNAMED"
JAVA_OPTS_FOR_JDK_17="-Dfile.encoding=UTF-8 -Djavax.security.auth.useSubjectCredsOnly=false -Xmx16384m -Xms16384m -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=$LOG_DIR -Xlog:gc*,classhisto*=trace:$LOG_DIR/fe.gc.log.$CUR_DATE:time,uptime:filecount=10,filesize=50M --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens java.base/jdk.internal.ref=ALL-UNNAMED"

# Set your own JAVA_HOME
# JAVA_HOME=/path/to/jdk/
# JAVA_HOME=/home/lzy/doris_need/to/java/jdk-17.0.10

##
## the lowercase properties are read by main program.
Expand All @@ -57,7 +57,7 @@ arrow_flight_sql_port = -1
# If no ip match this rule, will choose one randomly.
# use CIDR format, e.g. 10.10.10.0/24 or IP format, e.g. 10.10.10.1
# Default value is empty.
# priority_networks = 10.10.10.0/24;192.168.0.0/16
# priority_networks = 172.25.78.180/20

# Advanced configurations
# log_roll_size_mb = 1024
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@
import org.apache.doris.nereids.trees.expressions.functions.scalar.Char;
import org.apache.doris.nereids.trees.expressions.functions.scalar.CharacterLength;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Coalesce;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Compress;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Concat;
import org.apache.doris.nereids.trees.expressions.functions.scalar.ConcatWs;
import org.apache.doris.nereids.trees.expressions.functions.scalar.ConnectionId;
Expand Down Expand Up @@ -449,6 +450,7 @@
import org.apache.doris.nereids.trees.expressions.functions.scalar.Trim;
import org.apache.doris.nereids.trees.expressions.functions.scalar.TrimIn;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Truncate;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Uncompress;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Unhex;
import org.apache.doris.nereids.trees.expressions.functions.scalar.UnixTimestamp;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Upper;
Expand Down Expand Up @@ -974,7 +976,9 @@ public class BuiltinScalarFunctions implements FunctionHelper {
scalar(YearsSub.class, "years_sub"),
scalar(MultiMatch.class, "multi_match"),
scalar(SessionUser.class, "session_user"),
scalar(LastQueryId.class, "last_query_id"));
scalar(LastQueryId.class, "last_query_id"),
scalar(Compress.class, "compress"),
scalar(Uncompress.class, "uncompress"));

public static final BuiltinScalarFunctions INSTANCE = new BuiltinScalarFunctions();

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

package org.apache.doris.nereids.trees.expressions.functions.scalar;

import org.apache.doris.catalog.FunctionSignature;
import org.apache.doris.nereids.trees.expressions.Expression;
import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
import org.apache.doris.nereids.trees.expressions.shape.UnaryExpression;
import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
import org.apache.doris.nereids.types.StringType;
import org.apache.doris.nereids.types.VarcharType;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;

import java.util.List;

/**
* ScalarFunction 'compress'.
*/
public class Compress extends ScalarFunction
        implements UnaryExpression, ExplicitlyCastableSignature, PropagateNullable {

    /**
     * Supported signatures: a VARCHAR argument returns VARCHAR, a STRING argument returns STRING.
     */
    public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
            FunctionSignature.ret(VarcharType.SYSTEM_DEFAULT).args(VarcharType.SYSTEM_DEFAULT),
            FunctionSignature.ret(StringType.INSTANCE).args(StringType.INSTANCE));

    /**
     * Creates the 'compress' scalar function over a single argument expression.
     */
    public Compress(Expression arg) {
        super("compress", arg);
    }

    @Override
    public List<FunctionSignature> getSignatures() {
        return SIGNATURES;
    }

    @Override
    public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
        return visitor.visitCompress(this, context);
    }

    /**
     * Rebuilds this function around the given children; exactly one child is required.
     */
    @Override
    public Compress withChildren(List<Expression> children) {
        Preconditions.checkArgument(children.size() == 1);
        final Expression onlyChild = children.get(0);
        return new Compress(onlyChild);
    }
}
Loading

0 comments on commit 124faf8

Please sign in to comment.