Skip to content

Commit a1d883e

Browse files
authored
feat(cli): add --infer-schema flag to infer table schema from parquet files (#927)
Added a new --infer-schema flag to `iceberg create table` that infers the table schema from a local data file. Currently supports Parquet (.parquet, .parq), with the dispatch designed to support additional formats (e.g. Avro, ORC) in the future. When a Parquet file contains Iceberg field IDs, they are preserved. Files without field IDs are assigned fresh sequential IDs. The flag is mutually exclusive with --schema, and the inferred schema is printed before table creation for user verification.
1 parent 41f5c54 commit a1d883e

1 file changed

Lines changed: 83 additions & 5 deletions

File tree

cmd/iceberg/main.go

Lines changed: 83 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,12 @@ import (
2323
"fmt"
2424
"log"
2525
"os"
26+
"path/filepath"
2627
"strings"
2728

29+
"github.com/apache/arrow-go/v18/arrow/memory"
30+
"github.com/apache/arrow-go/v18/parquet/file"
31+
"github.com/apache/arrow-go/v18/parquet/pqarrow"
2832
"github.com/apache/iceberg-go"
2933
"github.com/apache/iceberg-go/catalog"
3034
"github.com/apache/iceberg-go/catalog/glue"
@@ -90,6 +94,8 @@ Options:
9094
--location-uri TEXT specify a location URI for the namespace
9195
--schema JSON specify table schema in json (for create table use only)
9296
Ex: [{"name":"id","type":"int","required":false,"doc":"unique id"}]
97+
--infer-schema FILE infer table schema from a local data file (for create table use only)
98+
Supported formats: parquet
9399
--properties TEXT specify table properties in key=value format (for create table use only)
94100
Ex:"format-version=2,write.format.default=parquet"
95101
--partition-spec TEXT specify partition spec as comma-separated field names(for create table use only)
@@ -144,6 +150,7 @@ type Config struct {
144150
Description string `docopt:"--description"`
145151
LocationURI string `docopt:"--location-uri"`
146152
SchemaStr string `docopt:"--schema"`
153+
InferSchema string `docopt:"--infer-schema"`
147154
TableProps string `docopt:"--properties"`
148155
PartitionSpec string `docopt:"--partition-spec"`
149156
SortOrder string `docopt:"--sort-order"`
@@ -322,14 +329,35 @@ func main() {
322329
}
323330
output.Text("Namespace " + cfg.Ident + " created successfully")
324331
case cfg.Table:
325-
if cfg.SchemaStr == "" {
326-
output.Error(errors.New("missing --schema for table creation"))
332+
if cfg.SchemaStr != "" && cfg.InferSchema != "" {
333+
output.Error(errors.New("--schema and --infer-schema are mutually exclusive"))
327334
os.Exit(1)
328335
}
329336

330-
schema, err := iceberg.NewSchemaFromJsonFields(0, cfg.SchemaStr)
331-
if err != nil {
332-
output.Error(err)
337+
var schema *iceberg.Schema
338+
339+
switch {
340+
case cfg.SchemaStr != "":
341+
var err error
342+
343+
schema, err = iceberg.NewSchemaFromJsonFields(0, cfg.SchemaStr)
344+
if err != nil {
345+
output.Error(err)
346+
os.Exit(1)
347+
}
348+
case cfg.InferSchema != "":
349+
var err error
350+
351+
schema, err = schemaFromFile(cfg.InferSchema)
352+
if err != nil {
353+
output.Error(err)
354+
os.Exit(1)
355+
}
356+
357+
output.Text("Inferred schema from " + cfg.InferSchema + ":")
358+
output.Schema(schema)
359+
default:
360+
output.Error(errors.New("missing --schema or --infer-schema for table creation"))
333361
os.Exit(1)
334362
}
335363

@@ -569,3 +597,53 @@ func mergeConf(fileConf *config.CatalogConfig, resConfig *Config, explicitFlags
569597
resConfig.RestOptions = fileConf.RestOptions
570598
}
571599
}
600+
601+
func schemaFromFile(path string) (*iceberg.Schema, error) {
602+
ext := strings.ToLower(filepath.Ext(path))
603+
604+
switch ext {
605+
case ".parquet", ".parq":
606+
return schemaFromParquetFile(path)
607+
default:
608+
return nil, fmt.Errorf("unsupported file format %s for %s: only .parquet and .parq files are supported", ext, path)
609+
}
610+
}
611+
612+
func schemaFromParquetFile(path string) (*iceberg.Schema, error) {
613+
f, err := os.Open(path)
614+
if err != nil {
615+
return nil, fmt.Errorf("failed to open file: %w", err)
616+
}
617+
618+
pqReader, err := file.NewParquetReader(f)
619+
if err != nil {
620+
f.Close()
621+
622+
return nil, fmt.Errorf("failed to read parquet file: %w", err)
623+
}
624+
defer pqReader.Close() // also closes underlying file
625+
626+
arrowReader, err := pqarrow.NewFileReader(pqReader, pqarrow.ArrowReadProperties{}, memory.DefaultAllocator)
627+
if err != nil {
628+
return nil, fmt.Errorf("failed to read parquet schema: %w", err)
629+
}
630+
631+
arrowSchema, err := arrowReader.Schema()
632+
if err != nil {
633+
return nil, fmt.Errorf("failed to get arrow schema: %w", err)
634+
}
635+
636+
// Prefer existing field IDs from the Parquet file (written by Iceberg-aware
637+
// tools like Spark or PyIceberg). Fall back to fresh sequential IDs only
638+
// when the error is specifically about missing field IDs.
639+
schema, err := table.ArrowSchemaToIceberg(arrowSchema, true, nil)
640+
if err != nil {
641+
if errors.Is(err, iceberg.ErrInvalidSchema) && strings.Contains(err.Error(), "field-id") {
642+
return table.ArrowSchemaToIcebergWithFreshIDs(arrowSchema, true)
643+
}
644+
645+
return nil, fmt.Errorf("failed to convert parquet schema to iceberg: %w", err)
646+
}
647+
648+
return schema, nil
649+
}

0 commit comments

Comments
 (0)