Skip to content

Commit

Permalink
Add new variant types
Browse files Browse the repository at this point in the history
  • Loading branch information
gene-db committed Feb 5, 2025
1 parent 2056297 commit 1ea911c
Show file tree
Hide file tree
Showing 4 changed files with 298 additions and 6 deletions.
100 changes: 96 additions & 4 deletions parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,9 @@
import java.io.CharArrayWriter;
import java.io.IOException;
import java.math.BigDecimal;
import java.time.Instant;
import java.time.LocalDate;
import java.time.ZoneId;
import java.time.ZoneOffset;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.time.*;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeFormatterBuilder;
import java.time.temporal.ChronoField;
Expand All @@ -53,6 +52,7 @@ public final class Variant {
* short list.
*/
static final int BINARY_SEARCH_THRESHOLD = 32;

static final ZoneId UTC = ZoneId.of("UTC");

public Variant(byte[] value, byte[] metadata) {
Expand Down Expand Up @@ -126,6 +126,13 @@ public byte[] getBinary() {
return VariantUtil.getBinary(value, pos);
}

/**
* @return the UUID value
*/
public byte[] getUUID() {
return VariantUtil.getUUID(value, pos);
}

/**
* @return the string value
*/
Expand Down Expand Up @@ -327,12 +334,32 @@ public String toJson(ZoneId zoneId, boolean truncateTrailingZeros) {
.appendFraction(ChronoField.MICRO_OF_SECOND, 6, 6, true)
.toFormatter(Locale.US);

/** The format for a timestamp without time zone, with nanosecond precision. */
private static final DateTimeFormatter TIMESTAMP_NANOS_NTZ_FORMATTER = new DateTimeFormatterBuilder()
.append(DateTimeFormatter.ISO_LOCAL_DATE)
.appendLiteral('T')
.appendPattern("HH:mm:ss")
.appendFraction(ChronoField.NANO_OF_SECOND, 9, 9, true)
.toFormatter(Locale.US);

/** The format for a timestamp with time zone. */
private static final DateTimeFormatter TIMESTAMP_FORMATTER = new DateTimeFormatterBuilder()
.append(TIMESTAMP_NTZ_FORMATTER)
.appendOffset("+HH:MM", "+00:00")
.toFormatter(Locale.US);

/** The format for a timestamp with time zone, with nanosecond precision. */
private static final DateTimeFormatter TIMESTAMP_NANOS_FORMATTER = new DateTimeFormatterBuilder()
.append(TIMESTAMP_NANOS_NTZ_FORMATTER)
.appendOffset("+HH:MM", "+00:00")
.toFormatter(Locale.US);

/** The format for a time. */
private static final DateTimeFormatter TIME_FORMATTER = new DateTimeFormatterBuilder()
.appendPattern("HH:mm:ss")
.appendFraction(ChronoField.MICRO_OF_SECOND, 6, 6, true)
.toFormatter(Locale.US);

/** The format for a timestamp without time zone, truncating trailing microsecond zeros. */
private static final DateTimeFormatter TIMESTAMP_NTZ_TRUNC_FORMATTER = new DateTimeFormatterBuilder()
.append(DateTimeFormatter.ISO_LOCAL_DATE)
Expand All @@ -343,16 +370,50 @@ public String toJson(ZoneId zoneId, boolean truncateTrailingZeros) {
.optionalEnd()
.toFormatter(Locale.US);

/**
* The format for a timestamp without time zone, with nanosecond precision, truncating
* trailing nanosecond zeros.
*/
private static final DateTimeFormatter TIMESTAMP_NANOS_NTZ_TRUNC_FORMATTER = new DateTimeFormatterBuilder()
.append(DateTimeFormatter.ISO_LOCAL_DATE)
.appendLiteral('T')
.appendPattern("HH:mm:ss")
.optionalStart()
.appendFraction(ChronoField.NANO_OF_SECOND, 0, 9, true)
.optionalEnd()
.toFormatter(Locale.US);

/** The format for a timestamp with time zone, truncating trailing microsecond zeros. */
private static final DateTimeFormatter TIMESTAMP_TRUNC_FORMATTER = new DateTimeFormatterBuilder()
.append(TIMESTAMP_NTZ_TRUNC_FORMATTER)
.appendOffset("+HH:MM", "+00:00")
.toFormatter(Locale.US);

/**
* The format for a timestamp with time zone, with nanosecond precision, truncating trailing
* nanosecond zeros.
*/
private static final DateTimeFormatter TIMESTAMP_NANOS_TRUNC_FORMATTER = new DateTimeFormatterBuilder()
.append(TIMESTAMP_NANOS_NTZ_TRUNC_FORMATTER)
.appendOffset("+HH:MM", "+00:00")
.toFormatter(Locale.US);

/** The format for a time, truncating trailing microsecond zeros. */
private static final DateTimeFormatter TIME_TRUNC_FORMATTER = new DateTimeFormatterBuilder()
.appendPattern("HH:mm:ss")
.optionalStart()
.appendFraction(ChronoField.MICRO_OF_SECOND, 0, 6, true)
.optionalEnd()
.toFormatter(Locale.US);

private static Instant microsToInstant(long microsSinceEpoch) {
return Instant.EPOCH.plus(microsSinceEpoch, ChronoUnit.MICROS);
}

private static Instant nanosToInstant(long timestampNanos) {
return Instant.EPOCH.plus(timestampNanos, ChronoUnit.NANOS);
}

private static void toJsonImpl(
byte[] value, byte[] metadata, int pos, JsonGenerator gen, ZoneId zoneId, boolean truncateTrailingZeros)
throws IOException {
Expand Down Expand Up @@ -447,6 +508,37 @@ private static void toJsonImpl(
case BINARY:
gen.writeString(Base64.getEncoder().encodeToString(VariantUtil.getBinary(value, pos)));
break;
case TIME:
if (truncateTrailingZeros) {
gen.writeString(TIME_TRUNC_FORMATTER.format(
LocalTime.ofNanoOfDay(VariantUtil.getLong(value, pos) * 1_000)));
} else {
gen.writeString(
TIME_FORMATTER.format(LocalTime.ofNanoOfDay(VariantUtil.getLong(value, pos) * 1_000)));
}
break;
case TIMESTAMP_NANOS:
if (truncateTrailingZeros) {
gen.writeString(TIMESTAMP_NANOS_TRUNC_FORMATTER.format(
nanosToInstant(VariantUtil.getLong(value, pos)).atZone(zoneId)));
} else {
gen.writeString(TIMESTAMP_NANOS_FORMATTER.format(
nanosToInstant(VariantUtil.getLong(value, pos)).atZone(zoneId)));
}
break;
case TIMESTAMP_NANOS_NTZ:
if (truncateTrailingZeros) {
gen.writeString(TIMESTAMP_NANOS_NTZ_TRUNC_FORMATTER.format(
nanosToInstant(VariantUtil.getLong(value, pos)).atZone(ZoneOffset.UTC)));
} else {
gen.writeString(TIMESTAMP_NANOS_NTZ_FORMATTER.format(
nanosToInstant(VariantUtil.getLong(value, pos)).atZone(ZoneOffset.UTC)));
}
break;
case UUID:
ByteBuffer bb = ByteBuffer.wrap(VariantUtil.getUUID(value, pos)).order(ByteOrder.BIG_ENDIAN);
gen.writeString(new java.util.UUID(bb.getLong(), bb.getLong()).toString());
break;
default:
throw new IllegalArgumentException("Unsupported type: " + VariantUtil.getType(value, pos));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,27 @@ public void appendTimestampNtz(long microsSinceEpoch) {
writePos += 8;
}

public void appendTime(long microsSinceMidnight) {
checkCapacity(1 + 8);
writeBuffer[writePos++] = VariantUtil.primitiveHeader(VariantUtil.TIME);
VariantUtil.writeLong(writeBuffer, writePos, microsSinceMidnight, 8);
writePos += 8;
}

public void appendTimestampNanos(long nanosSinceEpoch) {
checkCapacity(1 + 8);
writeBuffer[writePos++] = VariantUtil.primitiveHeader(VariantUtil.TIMESTAMP_NANOS);
VariantUtil.writeLong(writeBuffer, writePos, nanosSinceEpoch, 8);
writePos += 8;
}

public void appendTimestampNanosNtz(long nanosSinceEpoch) {
checkCapacity(1 + 8);
writeBuffer[writePos++] = VariantUtil.primitiveHeader(VariantUtil.TIMESTAMP_NANOS_NTZ);
VariantUtil.writeLong(writeBuffer, writePos, nanosSinceEpoch, 8);
writePos += 8;
}

public void appendFloat(float f) {
checkCapacity(1 + 4);
writeBuffer[writePos++] = VariantUtil.primitiveHeader(VariantUtil.FLOAT);
Expand All @@ -279,6 +300,14 @@ public void appendBinary(byte[] binary) {
writePos += binary.length;
}

public void appendUUID(byte[] uuid) {
assert uuid.length == VariantUtil.UUID_SIZE;
checkCapacity(1 + VariantUtil.UUID_SIZE);
writeBuffer[writePos++] = VariantUtil.primitiveHeader(VariantUtil.UUID);
System.arraycopy(uuid, 0, writeBuffer, writePos, uuid.length);
writePos += uuid.length;
}

/**
* Adds a key to the Variant dictionary. If the key already exists, the dictionary is unmodified.
* @param key the key to add
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,28 @@ public class VariantUtil {
* string size) + (size bytes of string content).
*/
public static final int LONG_STR = 16;
/**
* Time value. Values can be from 00:00:00 to 23:59:59.999999.
* Content is 8-byte little-endian unsigned integer that represents the number of microseconds
* since midnight.
*/
public static final int TIME = 17;
/**
* Timestamp nanos value. Similar to `TIMESTAMP`, but represents the number of nanoseconds
* elapsed since the Unix epoch, 1970-01-01 00:00:00 UTC.
*/
public static final int TIMESTAMP_NANOS = 18;
/**
* Timestamp nanos (without timestamp) value. It has the same content as `TIMESTAMP_NANOS` but
* should always be interpreted as if the local time zone is UTC.
*/
public static final int TIMESTAMP_NANOS_NTZ = 19;
/**
* UUID value. The content is a 16-byte binary, encoded using big-endian.
* For example, UUID 00112233-4455-6677-8899-aabbccddeeff is encoded as the bytes
* 00 11 22 33 44 55 66 77 88 99 aa bb cc dd ee ff.
*/
public static final int UUID = 20;

// The metadata version.
public static final byte VERSION = 1;
Expand All @@ -160,6 +182,9 @@ public class VariantUtil {
public static final int MAX_DECIMAL8_PRECISION = 18;
public static final int MAX_DECIMAL16_PRECISION = 38;

// The size (in bytes) of a UUID.
public static final int UUID_SIZE = 16;

// Default size limit for both variant value and variant metadata.
public static final int DEFAULT_SIZE_LIMIT = U24_MAX + 1;

Expand Down Expand Up @@ -276,6 +301,10 @@ public enum Type {
TIMESTAMP_NTZ,
FLOAT,
BINARY,
TIME,
TIMESTAMP_NANOS,
TIMESTAMP_NANOS_NTZ,
UUID
}

public static int getPrimitiveTypeId(byte[] value, int pos) {
Expand Down Expand Up @@ -332,6 +361,14 @@ public static Type getType(byte[] value, int pos) {
return Type.BINARY;
case LONG_STR:
return Type.STRING;
case TIME:
return Type.TIME;
case TIMESTAMP_NANOS:
return Type.TIMESTAMP_NANOS;
case TIMESTAMP_NANOS_NTZ:
return Type.TIMESTAMP_NANOS_NTZ;
case UUID:
return Type.UUID;
default:
throw unknownPrimitiveTypeInVariant(typeInfo);
}
Expand Down Expand Up @@ -383,6 +420,9 @@ public static int valueSize(byte[] value, int pos) {
case DOUBLE:
case TIMESTAMP:
case TIMESTAMP_NTZ:
case TIME:
case TIMESTAMP_NANOS:
case TIMESTAMP_NANOS_NTZ:
return 9;
case DECIMAL4:
return 6;
Expand All @@ -393,6 +433,8 @@ public static int valueSize(byte[] value, int pos) {
case BINARY:
case LONG_STR:
return 1 + U32_SIZE + readUnsigned(value, pos + 1, U32_SIZE);
case UUID:
return 1 + UUID_SIZE;
default:
throw unknownPrimitiveTypeInVariant(typeInfo);
}
Expand All @@ -416,11 +458,14 @@ public static boolean getBoolean(byte[] value, int pos) {
/**
* Returns a long value from Variant value `value[pos...]`.
* It is only legal to call it if `getType` returns one of Type.LONG, DATE, TIMESTAMP,
* TIMESTAMP_NTZ.
* TIMESTAMP_NTZ, TIME, TIMESTAMP_NANOS, TIMESTAMP_NANOS_NTZ.
* If the type is `DATE`, the return value is guaranteed to fit into an int and
* represents the number of days from the Unix epoch.
* If the type is `TIMESTAMP/TIMESTAMP_NTZ`, the return value represents the number of
* microseconds from the Unix epoch.
* If the type is `TIME`, the return value represents the number of microseconds since midnight.
* If the type is `TIMESTAMP_NANOS/TIMESTAMP_NANOS_NTZ`, the return value represents the number of
* nanoseconds from the Unix epoch.
* @param value The Variant value
* @param pos The starting index of the Variant value
* @return The long value
Expand All @@ -442,6 +487,9 @@ public static long getLong(byte[] value, int pos) {
case INT8:
case TIMESTAMP:
case TIMESTAMP_NTZ:
case TIME:
case TIMESTAMP_NANOS:
case TIMESTAMP_NANOS_NTZ:
return readLong(value, pos + 1, 8);
default:
throw new IllegalStateException(exceptionMessage);
Expand Down Expand Up @@ -546,6 +594,16 @@ public static String getString(byte[] value, int pos) {
throw unexpectedType(Type.STRING);
}

public static byte[] getUUID(byte[] value, int pos) {
checkIndex(pos, value.length);
int basicType = value[pos] & BASIC_TYPE_MASK;
int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK;
if (basicType != PRIMITIVE || typeInfo != UUID) throw unexpectedType(Type.UUID);
int start = pos + 1;
checkIndex(start + UUID_SIZE - 1, value.length);
return Arrays.copyOfRange(value, start, start + UUID_SIZE);
}

/**
* An interface for the Variant object handler.
* @param <T> The return type of the handler
Expand Down
Loading

0 comments on commit 1ea911c

Please sign in to comment.