Skip to content

Commit c576fd1

Browse files
authored
fix(table): seed UpdateSchema id counter from metadata LastColumnID (#936)
## Problem `NewUpdateSchema` seeds its fresh-id counter from `txn.meta.CurrentSchema().HighestFieldID()`. The Iceberg spec reserves `last-column-id` on table metadata as the monotonic counter for this purpose; Java's [`SchemaUpdate`](https://github.com/apache/iceberg/blob/main/core/src/main/java/org/apache/iceberg/SchemaUpdate.java) seeds from `metadata.lastColumnId()`. When the current schema's highest id is lower than the lifetime max — after a previous evolution added columns that were later dropped, or after a schema swap — `AddColumn` allocates ids already referenced by fields in historical schemas. Catalogs that index the full schema history (e.g. AWS Glue's `schemasToGlueColumns`) then reject the commit with a duplicate field id error. ## Fix Seed `lastColumnID` from `txn.meta.LastColumnID()` in `NewUpdateSchema`. Adds a public `LastColumnID()` getter on `*MetadataBuilder` — the `Metadata` interface already exposes this value, but `txn.meta` is the builder. `MetadataBuilder.AddSchema` already keeps `last-column-id` monotonic via `max(b.lastColumnId, schema.HighestFieldID())`, so no other allocator sites need changes. ## Testing Adds `TestAddColumnMonotonicFieldIDs`: constructs metadata whose `last-column-id` exceeds the current schema's `HighestFieldID()` (by adding a higher-id schema while leaving the original as current) and asserts that `AddColumn` allocates above `last-column-id`. Confirmed failing on `main` before the fix and passing after. `go test ./...` passes. Fixes #935 Signed-off-by: Ashley Jeffs <ash@jeffail.uk>
1 parent f8df330 commit c576fd1

3 files changed

Lines changed: 60 additions & 3 deletions

File tree

table/metadata.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -296,6 +296,10 @@ func (b *MetadataBuilder) CurrentSchema() *iceberg.Schema {
296296

297297
func (b *MetadataBuilder) LastUpdatedMS() int64 { return b.lastUpdatedMS }
298298

299+
// LastColumnID returns the highest field id ever assigned in this table's
300+
// lifetime, as tracked by the Iceberg spec's last-column-id counter.
301+
func (b *MetadataBuilder) LastColumnID() int { return b.lastColumnId }
302+
299303
func (b *MetadataBuilder) nextSequenceNumber() int64 {
300304
if b.formatVersion > 1 {
301305
if b.lastSequenceNumber == nil {

table/update_schema.go

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -133,9 +133,14 @@ func WithNameMapping(nameMapping iceberg.NameMapping) UpdateSchemaOption {
133133
// Returns an UpdateSchema instance that can be used to build and apply schema changes.
134134
func NewUpdateSchema(txn *Transaction, caseSensitive bool, allowIncompatibleChanges bool, opts ...UpdateSchemaOption) *UpdateSchema {
135135
u := &UpdateSchema{
136-
txn: txn,
137-
schema: nil,
138-
lastColumnID: txn.meta.CurrentSchema().HighestFieldID(),
136+
txn: txn,
137+
schema: nil,
138+
// Seed from metadata's last-column-id rather than the current schema's
139+
// highest field id. Per the Iceberg spec, last-column-id is a monotonic
140+
// counter that preserves ids across schema evolution (including
141+
// deletions) so that newly-allocated ids never collide with ids still
142+
// referenced by historical schemas.
143+
lastColumnID: txn.meta.LastColumnID(),
139144

140145
deletes: make(map[int]struct{}),
141146
updates: make(map[int]map[int]iceberg.NestedField),

table/update_schema_test.go

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -913,3 +913,51 @@ func TestBuildUpdates(t *testing.T) {
913913
assert.Equal(t, 0, updates[0].(*setCurrentSchemaUpdate).SchemaID)
914914
})
915915
}
916+
917+
// TestAddColumnMonotonicFieldIDs exercises the case where the table's
918+
// last-column-id is greater than the current schema's highest field id — for
919+
// example because a previous schema was added that introduced higher ids, or
920+
// because the highest-id columns were later dropped. The Iceberg spec requires
921+
// new field ids to be allocated above last-column-id (never to be reused), so
922+
// AddColumn must seed its id counter from metadata.LastColumnID() rather than
923+
// the current schema's HighestFieldID().
924+
func TestAddColumnMonotonicFieldIDs(t *testing.T) {
925+
// Start from originalSchema (field ids 1..11, schema id 1).
926+
baseMeta, err := NewMetadata(originalSchema, iceberg.UnpartitionedSpec, UnsortedSortOrder, "", nil)
927+
assert.NoError(t, err)
928+
929+
// Add a second schema that introduces higher field ids. This bumps the
930+
// metadata's last-column-id to 13 while the current schema is still the
931+
// original (highest field id 11).
932+
expanded := iceberg.NewSchema(0,
933+
iceberg.NestedField{ID: 1, Name: "id", Type: iceberg.PrimitiveTypes.Int32, Required: true},
934+
iceberg.NestedField{ID: 12, Name: "extra_a", Type: iceberg.PrimitiveTypes.String, Required: false},
935+
iceberg.NestedField{ID: 13, Name: "extra_b", Type: iceberg.PrimitiveTypes.String, Required: false},
936+
)
937+
938+
builder, err := MetadataBuilderFromBase(baseMeta, "")
939+
assert.NoError(t, err)
940+
assert.NoError(t, builder.AddSchema(expanded))
941+
942+
meta, err := builder.Build()
943+
assert.NoError(t, err)
944+
945+
assert.Equal(t, 11, meta.CurrentSchema().HighestFieldID(),
946+
"precondition: current schema should still be the original with highest id 11")
947+
assert.Equal(t, 13, meta.LastColumnID(),
948+
"precondition: last-column-id should have been bumped by the expanded schema")
949+
950+
tbl := New([]string{"id"}, meta, "", nil, nil)
951+
txn := tbl.NewTransaction()
952+
953+
newSchema, err := NewUpdateSchema(txn, true, true).
954+
AddColumn([]string{"fresh"}, iceberg.PrimitiveTypes.String, "", false, nil).
955+
Apply()
956+
assert.NoError(t, err)
957+
assert.NotNil(t, newSchema)
958+
959+
fresh, ok := newSchema.FindFieldByName("fresh")
960+
assert.True(t, ok, "new field should be present in the resulting schema")
961+
assert.Equal(t, 14, fresh.ID,
962+
"new field id must be allocated above metadata.LastColumnID() (13), not reused from the current schema's highest id (11)")
963+
}

0 commit comments

Comments
 (0)