Skip to content

Commit d090bf8

Browse files
leochen4891claude
andauthored
feat: Support reading Parquet ENUM logical type as String (#7805)
Closes #7723 ## Background Parquet's `ENUM` logical type is physically identical to `STRING`: both annotate a `BINARY` column with UTF-8 encoded bytes. The only difference is the label. External tools such as Spark and PyArrow use `ENUM` to indicate a column holds a finite set of string values, but the wire format is the same. Deephaven's read pipeline has three stages where logical type is dispatched. All three previously had no handling for `EnumLogicalTypeAnnotation`, causing ENUM-annotated columns from externally produced files to fail on read. ## Changes ### Stage 1 — Schema to Java type (`ParquetSchemaReader`) **Before:** `visit(EnumLogicalTypeAnnotation)` set an error string and returned `Optional.empty()`, so the column was unresolvable. **After:** Returns `Optional.of(String.class)`, the same result as `visit(StringLogicalTypeAnnotation)`. ### Stage 2 — Column data to chunk (`ParquetColumnLocation`) **Before:** No `visit(EnumLogicalTypeAnnotation)` override existed, so the visitor returned `Optional.empty()` and the read failed at runtime. **After:** A new override delegates to `ToStringPage.create(...)`, the same decoder used for `STRING` columns. ### Stage 3 — Pushdown statistics (`MinMaxFromStatistics`) **Before:** `getMinMaxForStrings` only accepted `StringLogicalTypeAnnotation`, so ENUM columns returned `false` and forced a full scan on every filter. **After:** The condition adds `|| instanceof EnumLogicalTypeAnnotation`, enabling min/max pushdown for ENUM columns. ## Tests - `MinMaxFromStatisticsTest.enumLogicalStatisticsAreMaterialised` — unit test verifying ENUM statistics are extracted as strings. - `ParquetTableReadWriteTest.testReadEnumLogicalTypeAsString` — end-to-end test that writes a Parquet file with a `BINARY+ENUM` column and reads it back, verifying the column materializes as `String` with correct values. --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 69e6ea8 commit d090bf8

5 files changed

Lines changed: 87 additions & 19 deletions

File tree

extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetSchemaReader.java

Lines changed: 33 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -361,22 +361,9 @@ private static LogicalTypeAnnotation.LogicalTypeAnnotationVisitor<Class<?>> getV
361361
return new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor<Class<?>>() {
362362
@Override
363363
public Optional<Class<?>> visit(final LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) {
364-
final ColumnDescriptor column = currentColumn.getValue();
365-
final String columnName = column.getPath()[0];
366-
final ColumnTypeInfo columnTypeInfo = nonDefaultTypeColumns.get(columnName);
367-
final ColumnTypeInfo.SpecialType specialType =
368-
columnTypeInfo == null ? null : columnTypeInfo.specialType().orElse(null);
369-
if (specialType != null) {
370-
if (specialType == ColumnTypeInfo.SpecialType.StringSet) {
371-
return Optional.of(StringSet.class);
372-
}
373-
if (specialType != ColumnTypeInfo.SpecialType.Vector) {
374-
throw new UncheckedDeephavenException("Type " + column.getPrimitiveType()
375-
+ " for column " + Arrays.toString(column.getPath())
376-
+ " with unknown or incompatible special type " + specialType);
377-
}
378-
}
379-
return Optional.of(String.class);
364+
// Delegate to the shared helper so STRING and ENUM are resolved identically,
365+
// including any Deephaven-specific SpecialType metadata (StringSet, Vector).
366+
return visitStringLike(currentColumn.getValue(), nonDefaultTypeColumns);
380367
}
381368

382369
@Override
@@ -393,8 +380,10 @@ public Optional<Class<?>> visit(final LogicalTypeAnnotation.ListLogicalTypeAnnot
393380

394381
@Override
395382
public Optional<Class<?>> visit(final LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) {
396-
errorString.setValue("EnumLogicalType");
397-
return Optional.empty();
383+
// ENUM is physically identical to STRING (UTF-8 encoded BINARY). Delegate to the
384+
// shared helper so any future Deephaven SpecialType metadata on ENUM columns is
385+
// handled consistently with STRING columns.
386+
return visitStringLike(currentColumn.getValue(), nonDefaultTypeColumns);
398387
}
399388

400389
@Override
@@ -501,4 +490,30 @@ public Optional<Class<?>> visit(
501490
}
502491
};
503492
}
493+
494+
/**
495+
* Shared resolution logic for string-like logical types (STRING and ENUM). Both are physically BINARY with UTF-8
496+
* encoding, so they map to the same Java types. Centralising the logic here ensures that any Deephaven-specific
497+
* {@link ColumnTypeInfo.SpecialType} metadata (e.g. {@code StringSet}, {@code Vector}) is respected for both
498+
* annotations consistently, including any new special types added in the future.
499+
*/
500+
private static Optional<Class<?>> visitStringLike(
501+
final ColumnDescriptor column,
502+
final Map<String, ColumnTypeInfo> nonDefaultTypeColumns) {
503+
final String columnName = column.getPath()[0];
504+
final ColumnTypeInfo columnTypeInfo = nonDefaultTypeColumns.get(columnName);
505+
final ColumnTypeInfo.SpecialType specialType =
506+
columnTypeInfo == null ? null : columnTypeInfo.specialType().orElse(null);
507+
if (specialType != null) {
508+
if (specialType == ColumnTypeInfo.SpecialType.StringSet) {
509+
return Optional.of(StringSet.class);
510+
}
511+
if (specialType != ColumnTypeInfo.SpecialType.Vector) {
512+
throw new UncheckedDeephavenException("Type " + column.getPrimitiveType()
513+
+ " for column " + Arrays.toString(column.getPath())
514+
+ " with unknown or incompatible special type " + specialType);
515+
}
516+
}
517+
return Optional.of(String.class);
518+
}
504519
}

extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/MinMaxFromStatistics.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -392,7 +392,8 @@ static boolean getMinMaxForStrings(
392392
@NotNull final Consumer<String> maxSetter) {
393393
final PrimitiveType parquetColType = statistics.type();
394394
final LogicalTypeAnnotation logicalType = parquetColType.getLogicalTypeAnnotation();
395-
if (logicalType instanceof LogicalTypeAnnotation.StringLogicalTypeAnnotation) {
395+
if (logicalType instanceof LogicalTypeAnnotation.StringLogicalTypeAnnotation
396+
|| logicalType instanceof LogicalTypeAnnotation.EnumLogicalTypeAnnotation) {
396397
verifyPrimitive(statistics, PrimitiveType.PrimitiveTypeName.BINARY);
397398
final String minString = statistics.minAsString();
398399
final String maxString = statistics.maxAsString();

extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetColumnLocation.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -535,6 +535,12 @@ private static class LogicalTypeVisitor<ATTR extends Any>
535535
return Optional.of(ToStringPage.create(pageType, columnChunkReader.getDictionarySupplier()));
536536
}
537537

538+
@Override
539+
public Optional<ToPage<ATTR, ?>> visit(
540+
final LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) {
541+
return Optional.of(ToStringPage.create(pageType, columnChunkReader.getDictionarySupplier()));
542+
}
543+
538544
@Override
539545
public Optional<ToPage<ATTR, ?>> visit(
540546
final LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) {

extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,12 @@
5050
import io.deephaven.engine.util.file.TrackedFileHandleFactory;
5151
import io.deephaven.parquet.base.BigDecimalParquetBytesCodec;
5252
import io.deephaven.parquet.base.BigIntegerParquetBytesCodec;
53+
import io.deephaven.parquet.base.ColumnWriter;
5354
import io.deephaven.parquet.base.InvalidParquetFileException;
55+
import io.deephaven.parquet.base.NullParquetMetadataFileWriter;
5456
import io.deephaven.parquet.base.NullStatistics;
57+
import io.deephaven.parquet.base.ParquetFileWriter;
58+
import io.deephaven.parquet.base.RowGroupWriter;
5559
import io.deephaven.parquet.base.materializers.ParquetMaterializerUtils;
5660
import io.deephaven.parquet.table.location.ParquetTableLocation;
5761
import io.deephaven.parquet.table.location.ParquetTableLocationKey;
@@ -76,6 +80,7 @@
7680
import org.apache.commons.lang3.mutable.MutableDouble;
7781
import org.apache.commons.lang3.mutable.MutableFloat;
7882
import org.apache.commons.lang3.mutable.MutableObject;
83+
import org.apache.parquet.bytes.HeapByteBufferAllocator;
7984
import org.apache.parquet.column.Encoding;
8085
import org.apache.parquet.column.statistics.DoubleStatistics;
8186
import org.apache.parquet.column.statistics.IntStatistics;
@@ -133,6 +138,7 @@
133138
import static io.deephaven.util.QueryConstants.*;
134139
import static org.apache.parquet.schema.LogicalTypeAnnotation.decimalType;
135140
import static org.apache.parquet.schema.LogicalTypeAnnotation.intType;
141+
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
136142
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
137143
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;
138144
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64;
@@ -5002,6 +5008,31 @@ public void testSelectValidateDefinition() {
50025008
assertTrue(result.getDefinition().getColumn("Double").isDirect());
50035009
}
50045010

5011+
@Test
5012+
public void testReadEnumLogicalTypeAsString() throws IOException {
5013+
final MessageType schema = Types.buildMessage()
5014+
.required(BINARY).as(LogicalTypeAnnotation.enumType()).named("status")
5015+
.named("schema");
5016+
final File dest = new File(rootFile, "enum_logical_type.parquet");
5017+
final Binary[] values = {Binary.fromString("RED"), Binary.fromString("GREEN"), Binary.fromString("BLUE")};
5018+
final Statistics<?> stats = Statistics.createStats(schema.getType("status"));
5019+
for (final Binary v : values) {
5020+
stats.updateStats(v);
5021+
}
5022+
// ParquetFileWriter is AutoCloseable, so nest it in its own try-with-resources to
5023+
// guarantee the file is finalised even if an exception is thrown mid-write.
5024+
try (final java.io.OutputStream os = Files.newOutputStream(dest.toPath());
5025+
final ParquetFileWriter fileWriter = new ParquetFileWriter(dest.toURI(), os,
5026+
ParquetInstructions.EMPTY.getTargetPageSize(), new HeapByteBufferAllocator(), schema,
5027+
"UNCOMPRESSED", Collections.emptyMap(), NullParquetMetadataFileWriter.INSTANCE, true)) {
5028+
final RowGroupWriter rowGroupWriter = fileWriter.addRowGroup(values.length);
5029+
try (final ColumnWriter columnWriter = rowGroupWriter.addColumn("status")) {
5030+
columnWriter.addPageNoNulls(values, values.length, stats);
5031+
}
5032+
}
5033+
checkSingleTable(newTable(stringCol("status", "RED", "GREEN", "BLUE")), dest);
5034+
}
5035+
50055036
private void assertTableStatistics(Table inputTable, File dest) {
50065037
// Verify that the columns have the correct statistics.
50075038
final ParquetMetadata metadata =

extensions/parquet/table/src/test/java/io/deephaven/parquet/table/location/MinMaxFromStatisticsTest.java

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -648,6 +648,21 @@ public void stringLogicalStatisticsAreMaterialised() {
648648
max, "zzz");
649649
}
650650

651+
@Test
652+
public void enumLogicalStatisticsAreMaterialised() {
653+
final PrimitiveType colType = Types.required(PrimitiveType.PrimitiveTypeName.BINARY)
654+
.as(LogicalTypeAnnotation.enumType())
655+
.named("enumBinary");
656+
final Statistics<?> stats = buildStats(
657+
colType, "ALPHA".getBytes(StandardCharsets.UTF_8), "ZETA".getBytes(StandardCharsets.UTF_8), 0L);
658+
final MutableObject<String> min = new MutableObject<>();
659+
final MutableObject<String> max = new MutableObject<>();
660+
assertMatches(
661+
MinMaxFromStatistics.getMinMaxForStrings(stats, min::setValue, max::setValue),
662+
min, "ALPHA",
663+
max, "ZETA");
664+
}
665+
651666
@Test
652667
public void binaryWithoutStringLogicalTypeIsRejected() {
653668
final PrimitiveType colType = Types.required(PrimitiveType.PrimitiveTypeName.BINARY).named("rawBinary");

0 commit comments

Comments
 (0)