|
1 | 1 | package types |
2 | 2 |
|
3 | 3 | import ( |
| 4 | + "cloud.google.com/go/bigquery" |
4 | 5 | "encoding/base64" |
| 6 | + "encoding/binary" |
5 | 7 | "fmt" |
6 | | - "math/big" |
7 | | - "strconv" |
8 | | - "strings" |
9 | | - "time" |
10 | | - |
11 | 8 | "github.com/apache/arrow-go/v18/arrow" |
12 | 9 | "github.com/apache/arrow-go/v18/arrow/array" |
13 | 10 | "github.com/apache/arrow-go/v18/arrow/decimal128" |
14 | 11 | "github.com/apache/arrow-go/v18/arrow/decimal256" |
15 | 12 | "github.com/goccy/go-zetasqlite" |
16 | 13 | bigqueryv2 "google.golang.org/api/bigquery/v2" |
| 14 | + "math/big" |
| 15 | + "strconv" |
| 16 | + "strings" |
17 | 17 | ) |
18 | 18 |
|
19 | 19 | func TableToARROW(t *bigqueryv2.Table) (*arrow.Schema, error) { |
@@ -59,7 +59,7 @@ func tableFieldToARROW(f *bigqueryv2.TableFieldSchema) (*arrow.Field, error) { |
59 | 59 | case FieldBytes: |
60 | 60 | return &arrow.Field{Name: f.Name, Type: arrow.BinaryTypes.Binary}, nil |
61 | 61 | case FieldDate: |
62 | | - return &arrow.Field{Name: f.Name, Type: arrow.PrimitiveTypes.Date32}, nil |
| 62 | + return &arrow.Field{Name: f.Name, Type: arrow.PrimitiveTypes.Date64}, nil |
63 | 63 | case FieldDatetime: |
64 | 64 | return &arrow.Field{ |
65 | 65 | Name: f.Name, |
@@ -96,11 +96,21 @@ func tableFieldToARROW(f *bigqueryv2.TableFieldSchema) (*arrow.Field, error) { |
96 | 96 | return &arrow.Field{Name: f.Name, Type: arrow.StructOf(fields...)}, nil |
97 | 97 | case FieldNumeric: |
98 | 98 | // NUMERIC is a DECIMAL with precision 38, scale 9 |
99 | | - return &arrow.Field{Name: f.Name, Type: &arrow.Decimal128Type{Precision: 38, Scale: 9}}, nil |
| 99 | + return &arrow.Field{Name: f.Name, Type: &arrow.Decimal128Type{ |
| 100 | + Precision: bigquery.NumericPrecisionDigits, |
| 101 | + Scale: bigquery.NumericScaleDigits, |
| 102 | + }}, nil |
100 | 103 | case FieldBignumeric: |
101 | | - // BIGNUMERIC is a DECIMAL with precision 76, scale 38 |
102 | | - // BigQuery supports 76.76 digits (76 full digits, 77th is partial) |
103 | | - return &arrow.Field{Name: f.Name, Type: &arrow.Decimal256Type{Precision: 76, Scale: 38}}, nil |
| 104 | + // In BigQuery, BIGNUMERIC is a DECIMAL with precision 76 (partial 77), scale 38 |
| 105 | + // Values requiring 77 digits when scaled by 10^38 work fine, including the maximum value (±2^255 / 10^38). |
| 106 | + // These values can technically be encoded into the Arrow format, but most libraries (including arrow-go) |
| 107 | + // raise validation errors when trying to build them. |
| 108 | + // The values returned by the BigQuery Storage Read API raise errors when you try to validate them client side |
| 109 | + // but if you only access their values, it is fine. |
| 110 | + return &arrow.Field{Name: f.Name, Type: &arrow.Decimal256Type{ |
| 111 | + Precision: bigquery.BigNumericPrecisionDigits, // 76 |
| 112 | + Scale: bigquery.BigNumericScaleDigits, // 38 |
| 113 | + }}, nil |
104 | 114 | case FieldGeography: |
105 | 115 | return &arrow.Field{Name: f.Name, Type: arrow.BinaryTypes.String}, nil |
106 | 116 | case FieldInterval: |
@@ -148,12 +158,12 @@ func AppendValueToARROWBuilder(ptrv *string, builder array.Builder) error { |
148 | 158 | } |
149 | 159 | b.Append(decoded) |
150 | 160 | return nil |
151 | | - case *array.Date32Builder: |
| 161 | + case *array.Date64Builder: |
152 | 162 | t, err := parseDate(v) |
153 | 163 | if err != nil { |
154 | 164 | return err |
155 | 165 | } |
156 | | - b.Append(arrow.Date32(int32(t.Sub(time.Unix(0, 0)) / (24 * time.Hour)))) |
| 166 | + b.Append(arrow.Date64FromTime(t)) |
157 | 167 | return nil |
158 | 168 | case *array.Time64Builder: |
159 | 169 | t, err := parseTime(v) |
@@ -198,32 +208,83 @@ func AppendValueToARROWBuilder(ptrv *string, builder array.Builder) error { |
198 | 208 | // Convert to integer (this truncates any remaining fractional part) |
199 | 209 | scaledInt := new(big.Int).Div(scaled.Num(), scaled.Denom()) |
200 | 210 |
|
201 | | - // Convert to decimal128.Num |
202 | | - num := decimal128.FromBigInt(scaledInt) |
203 | | - b.Append(num) |
| 211 | + // Convert to decimal128.Num using Arrow's FromBigInt (handles two's complement correctly) |
| 212 | + b.Append(decimal128.FromBigInt(scaledInt)) |
204 | 213 | return nil |
205 | 214 | case *array.Decimal256Builder: |
206 | | - // BIGNUMERIC type: precision 77, scale 38 |
207 | | - // Parse the string value to a big.Rat, then convert to scaled integer |
| 215 | + // BIGNUMERIC type: precision 76, scale 38 |
| 216 | + // NOTE: BigQuery declares decimal256(76, 38) in the schema but doesn't enforce |
| 217 | + // precision during encoding. Values requiring 77 digits when scaled work fine in BigQuery. |
| 218 | + // We bypass Arrow's FromBigInt validation (bitlen > 255 check) and manually construct |
| 219 | + // the Decimal256 using the same logic but without the strict check. |
| 220 | + |
| 221 | + // Parse as rational number |
208 | 222 | rat := new(big.Rat) |
209 | 223 | if _, ok := rat.SetString(v); !ok { |
210 | | - return fmt.Errorf("failed to parse decimal value: %s", v) |
| 224 | + return fmt.Errorf("failed to parse BIGNUMERIC value: %s", v) |
211 | 225 | } |
212 | 226 |
|
213 | | - // Scale the value by 10^scale to get the integer representation |
214 | | - scale := int32(38) |
215 | | - scaleFactor := new(big.Int).Exp(big.NewInt(10), big.NewInt(int64(scale)), nil) |
216 | | - |
217 | | - // Multiply the rational by the scale factor |
| 227 | + // Scale by 10^38 to get integer representation |
| 228 | + scale := int64(bigquery.BigNumericScaleDigits) // 38 |
| 229 | + scaleFactor := new(big.Int).Exp(big.NewInt(10), big.NewInt(scale), nil) |
218 | 230 | scaled := new(big.Rat).Mul(rat, new(big.Rat).SetInt(scaleFactor)) |
219 | 231 |
|
220 | | - // Convert to integer (this truncates any remaining fractional part) |
| 232 | + // Convert to integer (truncating any remaining fractional part) |
221 | 233 | scaledInt := new(big.Int).Div(scaled.Num(), scaled.Denom()) |
222 | 234 |
|
223 | | - // Convert to decimal256.Num |
224 | | - num := decimal256.FromBigInt(scaledInt) |
| 235 | + // Replicate decimal256.FromBigInt logic without the bitlen > 255 check |
| 236 | + // This matches how Arrow handles two's complement representation |
| 237 | + var num decimal256.Num |
| 238 | + if scaledInt.Sign() == 0 { |
| 239 | + // Zero value, return default |
| 240 | + b.Append(num) |
| 241 | + return nil |
| 242 | + } |
| 243 | + |
| 244 | + num = decimal256FromScaledInt(scaledInt) |
| 245 | + |
225 | 246 | b.Append(num) |
226 | 247 | return nil |
227 | 248 | } |
228 | 249 | return fmt.Errorf("unexpected builder type %T", builder) |
229 | 250 | } |
| 251 | + |
| 252 | +func decimal256FromScaledInt(scaledInt *big.Int) decimal256.Num { |
| 253 | + b := scaledInt.FillBytes(make([]byte, 32)) |
| 254 | + |
| 255 | + var limbs [4]uint64 |
| 256 | + |
| 257 | + // Arrow Decimal256 uses little-endian uint64 limbs. |
| 258 | + // BigQuery and BigInt.FillBytes produce big-endian bytes. |
| 259 | + // |
| 260 | + // So the 256-bit structure: |
| 261 | + // b[0] ... b[31] (big endian) |
| 262 | + // maps to Arrow limbs: |
| 263 | + // limbs[0] = low 64 bits |
| 264 | + // limbs[3] = high 64 bits |
| 265 | + |
| 266 | + for i := 0; i < 4; i++ { |
| 267 | + // Big-endian slice for limb i: |
| 268 | + start := 32 - (i+1)*8 |
| 269 | + end := 32 - i*8 |
| 270 | + |
| 271 | + // Convert this BE 8-byte block into LE uint64 |
| 272 | + // Arrow stores each limb as native endian (LE) |
| 273 | + limbs[i] = binary.LittleEndian.Uint64(reverse8(b[start:end])) |
| 274 | + } |
| 275 | + |
| 276 | + dec := decimal256.New(limbs[3], limbs[2], limbs[1], limbs[0]) |
| 277 | + // If negative, negate to get two's complement representation |
| 278 | + if scaledInt.Sign() < 0 { |
| 279 | + dec = dec.Negate() |
| 280 | + } |
| 281 | + return dec |
| 282 | +} |
| 283 | + |
// reverse8 returns a freshly allocated 8-byte slice holding the first eight
// bytes of b in reverse order. The input is left untouched. It panics with an
// index-out-of-range error if b has fewer than eight bytes.
func reverse8(b []byte) []byte {
	var flipped [8]byte
	// Walk the source from its eighth byte down while filling the result from
	// the front, so b[7] is the first element touched.
	for dst, src := 0, 7; src >= 0; dst, src = dst+1, src-1 {
		flipped[dst] = b[src]
	}
	return flipped[:]
}
| 290 | +} |
0 commit comments