Skip to content

Commit 431a583

Browse files
authored
Merge pull request #50 from Recidiviz/dan/arrow-extreme
fix: Arrow serialization of extreme values
2 parents 2422b22 + 7ec8daf commit 431a583

File tree

4 files changed

+607
-39
lines changed

4 files changed

+607
-39
lines changed

server/storage_test.go

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1098,7 +1098,7 @@ func generateExampleMessages(numMessages int) ([][]byte, error) {
10981098
// NUMERIC and BIGNUMERIC can be passed as string, or more efficiently
10991099
// using a packed byte representation.
11001100
NumericCol: proto.String("99999999999999999999999999999.999999999"),
1101-
BignumericCol: proto.String("578960446186580977117854925043439539266.34992332820282019728792003956564819967"),
1101+
BignumericCol: proto.String("-578960446186580977117854925043439539266.34992332820282019728792003956564819968"),
11021102

11031103
// TIME also uses literal format.
11041104
TimeCol: proto.String("12:13:14.000000"),
@@ -1394,18 +1394,17 @@ func TestStorageReadWithAPICreatedTable(t *testing.T) {
13941394

13951395
testData := []TestRow{
13961396
{
1397-
StringCol: "hello",
1398-
IntCol: 42,
1399-
FloatCol: 3.14,
1400-
BoolCol: true,
1401-
BytesCol: []byte("abc"),
1402-
DateCol: "2024-01-01",
1403-
DatetimeCol: "2024-01-01T12:00:00",
1404-
TimestampCol: time.Date(2024, 1, 1, 12, 0, 0, 0, time.UTC),
1405-
TimeCol: civil.Time{Hour: 12},
1406-
NumericCol: "123.456",
1407-
// BIGNUMERIC max: 38 integer digits, 38 fractional digits
1408-
BignumericCol: "12345678901234567890123456789012345678.12345678901234567890123456789012345678",
1397+
StringCol: "hello",
1398+
IntCol: 42,
1399+
FloatCol: 3.14,
1400+
BoolCol: true,
1401+
BytesCol: []byte("abc"),
1402+
DateCol: "2024-01-01",
1403+
DatetimeCol: "2024-01-01T12:00:00",
1404+
TimestampCol: time.Date(2024, 1, 1, 12, 0, 0, 0, time.UTC),
1405+
TimeCol: civil.Time{Hour: 12},
1406+
NumericCol: "123.456",
1407+
BignumericCol: "578960446186580977117854925043439539266.34992332820282019728792003956564819967",
14091408
ArrayCol: []string{"x", "y"},
14101409
StructCol: map[string]interface{}{"field1": int64(1), "field2": "nested"},
14111410
},

test/python/extreme_values_test.py

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
"""Tests for extreme/boundary values in BigQuery types.
2+
3+
This test suite validates that extreme values (min/max) for various BigQuery
4+
data types are correctly handled by both the REST API and Storage Arrow API.
5+
"""
6+
import datetime
7+
from decimal import Decimal
8+
from typing import Any, Dict, Iterable
9+
10+
import grpc
11+
from google.cloud import bigquery, bigquery_storage
12+
from google.cloud.bigquery_storage_v1.services.big_query_read.transports import (
13+
BigQueryReadGrpcTransport,
14+
)
15+
from google.api_core.client_options import ClientOptions
16+
17+
from utils.big_query_emulator_test_case import BigQueryEmulatorTestCase
18+
19+
20+
class TestExtremeValues(BigQueryEmulatorTestCase):
    """Tests for extreme/boundary values in BigQuery types."""

    def _run_test_with_both_apis(
        self,
        query_str: str,
        expected_result: Iterable[Dict[str, Any]],
    ) -> None:
        """Run the same query against both REST API and Storage Arrow API.

        Args:
            query_str: SQL query to execute
            expected_result: Expected results as an iterable of dictionaries
        """
        # Materialize once up front: expected_result is typed as Iterable, and a
        # single-use iterable (e.g. a generator) consumed here would be empty by
        # the time the second (Storage API) comparison runs.
        expected = list(expected_result)

        # Test 1: REST API via job.result()
        query_job = self.client.query(query=query_str)
        rest_contents = list(
            {key: row.get(key) for key in row.keys()} for row in query_job.result()
        )
        self.assertEqual(
            expected,
            rest_contents,
            f"REST API results don't match. Expected: {expected}, Got: {rest_contents}"
        )

        # Test 2: Storage Arrow API via query_job.to_dataframe()
        # Create a fresh query for the Storage API test
        query_job = self.client.query(query=query_str)

        # Point a Storage Read client at the emulator's gRPC endpoint.
        host = f"localhost:{self.emulator.grpc_port}"
        channel = grpc.insecure_channel(host)
        try:
            transport = BigQueryReadGrpcTransport(channel=channel, host=host)
            bqstorage_client = bigquery_storage.BigQueryReadClient(
                transport=transport,
                client_options=ClientOptions(api_endpoint=host),
            )

            # Get DataFrame using Storage API
            df = query_job.to_dataframe(bqstorage_client=bqstorage_client)

            # Convert DataFrame to same format as the expected result
            arrow_contents = df.to_dict(orient='records')

            self.assertEqual(
                expected,
                arrow_contents,
                f"Storage Arrow API results don't match. Expected: {expected}, Got: {arrow_contents}"
            )
        finally:
            # Close the channel even when an assertion above fails, so a failing
            # test does not leak the gRPC connection.
            channel.close()

    def test_timestamp_min_max(self) -> None:
        """Tests resolution of https://github.com/goccy/go-zetasqlite/issues/132
        and https://github.com/goccy/bigquery-emulator/issues/262"""
        self._run_test_with_both_apis(
            """SELECT TIMESTAMP '0001-01-01 00:00:00.000000+00' AS min_ts, TIMESTAMP '9999-12-31 23:59:59.999999+00' AS max_ts""",
            expected_result=[
                {
                    "min_ts": datetime.datetime(
                        1, 1, 1, 0, 0, tzinfo=datetime.timezone.utc
                    ),
                    "max_ts": datetime.datetime(
                        9999, 12, 31, 23, 59, 59, 999999, tzinfo=datetime.timezone.utc
                    ),
                }
            ],
        )

    def test_int64_min_max(self) -> None:
        """Tests extreme INT64 values (64-bit signed integer range)"""
        self._run_test_with_both_apis(
            """SELECT -9223372036854775808 AS min_int64, 9223372036854775807 AS max_int64""",
            expected_result=[
                {
                    "min_int64": -9223372036854775808,
                    "max_int64": 9223372036854775807,
                }
            ],
        )

    def test_numeric_min_max(self) -> None:
        """Tests extreme NUMERIC values (38 digits, 9 after decimal point)"""
        self._run_test_with_both_apis(
            """SELECT
            NUMERIC '-99999999999999999999999999999.999999999' AS min_numeric,
            NUMERIC '99999999999999999999999999999.999999999' AS max_numeric
            """,
            expected_result=[
                {
                    "min_numeric": Decimal('-99999999999999999999999999999.999999999'),
                    "max_numeric": Decimal('99999999999999999999999999999.999999999'),
                }
            ],
        )

    def test_bignumeric_min_max(self) -> None:
        """Tests extreme BIGNUMERIC values"""
        self._run_test_with_both_apis(
            """SELECT
            BIGNUMERIC '-578960446186580977117854925043439539266.34992332820282019728792003956564819968' AS min_bignumeric,
            BIGNUMERIC '578960446186580977117854925043439539266.34992332820282019728792003956564819967' AS max_bignumeric
            """,
            expected_result=[
                {
                    "min_bignumeric": Decimal('-578960446186580977117854925043439539266.34992332820282019728792003956564819968'),
                    "max_bignumeric": Decimal('578960446186580977117854925043439539266.34992332820282019728792003956564819967'),
                }
            ],
        )

    def test_date_min_max(self) -> None:
        """Tests extreme DATE values"""
        self._run_test_with_both_apis(
            """SELECT DATE '0001-01-01' AS min_date, DATE '9999-12-31' AS max_date""",
            expected_result=[
                {
                    "min_date": datetime.date(1, 1, 1),
                    "max_date": datetime.date(9999, 12, 31),
                }
            ],
        )

    def test_datetime_min_max(self) -> None:
        """Tests extreme DATETIME values"""
        self._run_test_with_both_apis(
            """SELECT
            DATETIME '0001-01-01 00:00:00' AS min_datetime,
            DATETIME '9999-12-31 23:59:59.999999' AS max_datetime
            """,
            expected_result=[
                {
                    "min_datetime": datetime.datetime(1, 1, 1, 0, 0, 0),
                    "max_datetime": datetime.datetime(9999, 12, 31, 23, 59, 59, 999999),
                }
            ],
        )

    def test_time_min_max(self) -> None:
        """Tests extreme TIME values"""
        self._run_test_with_both_apis(
            """SELECT TIME '00:00:00' AS min_time, TIME '23:59:59.999999' AS max_time""",
            expected_result=[
                {
                    "min_time": datetime.time(0, 0, 0),
                    "max_time": datetime.time(23, 59, 59, 999999),
                }
            ],
        )

types/arrow.go

Lines changed: 87 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,19 @@
11
package types
22

33
import (
4+
"cloud.google.com/go/bigquery"
45
"encoding/base64"
6+
"encoding/binary"
57
"fmt"
6-
"math/big"
7-
"strconv"
8-
"strings"
9-
"time"
10-
118
"github.com/apache/arrow-go/v18/arrow"
129
"github.com/apache/arrow-go/v18/arrow/array"
1310
"github.com/apache/arrow-go/v18/arrow/decimal128"
1411
"github.com/apache/arrow-go/v18/arrow/decimal256"
1512
"github.com/goccy/go-zetasqlite"
1613
bigqueryv2 "google.golang.org/api/bigquery/v2"
14+
"math/big"
15+
"strconv"
16+
"strings"
1717
)
1818

1919
func TableToARROW(t *bigqueryv2.Table) (*arrow.Schema, error) {
@@ -59,7 +59,7 @@ func tableFieldToARROW(f *bigqueryv2.TableFieldSchema) (*arrow.Field, error) {
5959
case FieldBytes:
6060
return &arrow.Field{Name: f.Name, Type: arrow.BinaryTypes.Binary}, nil
6161
case FieldDate:
62-
return &arrow.Field{Name: f.Name, Type: arrow.PrimitiveTypes.Date32}, nil
62+
return &arrow.Field{Name: f.Name, Type: arrow.PrimitiveTypes.Date64}, nil
6363
case FieldDatetime:
6464
return &arrow.Field{
6565
Name: f.Name,
@@ -96,11 +96,21 @@ func tableFieldToARROW(f *bigqueryv2.TableFieldSchema) (*arrow.Field, error) {
9696
return &arrow.Field{Name: f.Name, Type: arrow.StructOf(fields...)}, nil
9797
case FieldNumeric:
9898
// NUMERIC is a DECIMAL with precision 38, scale 9
99-
return &arrow.Field{Name: f.Name, Type: &arrow.Decimal128Type{Precision: 38, Scale: 9}}, nil
99+
return &arrow.Field{Name: f.Name, Type: &arrow.Decimal128Type{
100+
Precision: bigquery.NumericPrecisionDigits,
101+
Scale: bigquery.NumericScaleDigits,
102+
}}, nil
100103
case FieldBignumeric:
101-
// BIGNUMERIC is a DECIMAL with precision 76, scale 38
102-
// BigQuery supports 76.76 digits (76 full digits, 77th is partial)
103-
return &arrow.Field{Name: f.Name, Type: &arrow.Decimal256Type{Precision: 76, Scale: 38}}, nil
104+
// In BigQuery, BIGNUMERIC is a DECIMAL with precision 76 (partial 77), scale 38
105+
// Values requiring 77 digits when scaled by 10^38 work fine, including the maximum value (±2^255 / 10^38).
106+
// These values can technically be encoded into the Arrow format, but most libraries (including arrow-go)
107+
// raise validation errors when trying to build them.
108+
// The values returned by the BigQuery Storage Read API raise errors when you try to validate them client side
109+
// but if you only access their values, it is fine.
110+
return &arrow.Field{Name: f.Name, Type: &arrow.Decimal256Type{
111+
Precision: bigquery.BigNumericPrecisionDigits, // 76
112+
Scale: bigquery.BigNumericScaleDigits, // 38
113+
}}, nil
104114
case FieldGeography:
105115
return &arrow.Field{Name: f.Name, Type: arrow.BinaryTypes.String}, nil
106116
case FieldInterval:
@@ -148,12 +158,12 @@ func AppendValueToARROWBuilder(ptrv *string, builder array.Builder) error {
148158
}
149159
b.Append(decoded)
150160
return nil
151-
case *array.Date32Builder:
161+
case *array.Date64Builder:
152162
t, err := parseDate(v)
153163
if err != nil {
154164
return err
155165
}
156-
b.Append(arrow.Date32(int32(t.Sub(time.Unix(0, 0)) / (24 * time.Hour))))
166+
b.Append(arrow.Date64FromTime(t))
157167
return nil
158168
case *array.Time64Builder:
159169
t, err := parseTime(v)
@@ -198,32 +208,83 @@ func AppendValueToARROWBuilder(ptrv *string, builder array.Builder) error {
198208
// Convert to integer (this truncates any remaining fractional part)
199209
scaledInt := new(big.Int).Div(scaled.Num(), scaled.Denom())
200210

201-
// Convert to decimal128.Num
202-
num := decimal128.FromBigInt(scaledInt)
203-
b.Append(num)
211+
// Convert to decimal128.Num using Arrow's FromBigInt (handles two's complement correctly)
212+
b.Append(decimal128.FromBigInt(scaledInt))
204213
return nil
205214
case *array.Decimal256Builder:
206-
// BIGNUMERIC type: precision 77, scale 38
207-
// Parse the string value to a big.Rat, then convert to scaled integer
215+
// BIGNUMERIC type: precision 76, scale 38
216+
// NOTE: BigQuery declares decimal256(76, 38) in the schema but doesn't enforce
217+
// precision during encoding. Values requiring 77 digits when scaled work fine in BigQuery.
218+
// We bypass Arrow's FromBigInt validation (bitlen > 255 check) and manually construct
219+
// the Decimal256 using the same logic but without the strict check.
220+
221+
// Parse as rational number
208222
rat := new(big.Rat)
209223
if _, ok := rat.SetString(v); !ok {
210-
return fmt.Errorf("failed to parse decimal value: %s", v)
224+
return fmt.Errorf("failed to parse BIGNUMERIC value: %s", v)
211225
}
212226

213-
// Scale the value by 10^scale to get the integer representation
214-
scale := int32(38)
215-
scaleFactor := new(big.Int).Exp(big.NewInt(10), big.NewInt(int64(scale)), nil)
216-
217-
// Multiply the rational by the scale factor
227+
// Scale by 10^38 to get integer representation
228+
scale := int64(bigquery.BigNumericScaleDigits) // 38
229+
scaleFactor := new(big.Int).Exp(big.NewInt(10), big.NewInt(scale), nil)
218230
scaled := new(big.Rat).Mul(rat, new(big.Rat).SetInt(scaleFactor))
219231

220-
// Convert to integer (this truncates any remaining fractional part)
232+
// Convert to integer (truncating any remaining fractional part)
221233
scaledInt := new(big.Int).Div(scaled.Num(), scaled.Denom())
222234

223-
// Convert to decimal256.Num
224-
num := decimal256.FromBigInt(scaledInt)
235+
// Replicate decimal256.FromBigInt logic without the bitlen > 255 check
236+
// This matches how Arrow handles two's complement representation
237+
var num decimal256.Num
238+
if scaledInt.Sign() == 0 {
239+
// Zero value, return default
240+
b.Append(num)
241+
return nil
242+
}
243+
244+
num = decimal256FromScaledInt(scaledInt)
245+
225246
b.Append(num)
226247
return nil
227248
}
228249
return fmt.Errorf("unexpected builder type %T", builder)
229250
}
251+
252+
func decimal256FromScaledInt(scaledInt *big.Int) decimal256.Num {
253+
b := scaledInt.FillBytes(make([]byte, 32))
254+
255+
var limbs [4]uint64
256+
257+
// Arrow Decimal256 uses little-endian uint64 limbs.
258+
// BigQuery and BigInt.FillBytes produce big-endian bytes.
259+
//
260+
// So the 256-bit structure:
261+
// b[0] ... b[31] (big endian)
262+
// maps to Arrow limbs:
263+
// limbs[0] = low 64 bits
264+
// limbs[3] = high 64 bits
265+
266+
for i := 0; i < 4; i++ {
267+
// Big-endian slice for limb i:
268+
start := 32 - (i+1)*8
269+
end := 32 - i*8
270+
271+
// Convert this BE 8-byte block into LE uint64
272+
// Arrow stores each limb as native endian (LE)
273+
limbs[i] = binary.LittleEndian.Uint64(reverse8(b[start:end]))
274+
}
275+
276+
dec := decimal256.New(limbs[3], limbs[2], limbs[1], limbs[0])
277+
// If negative, negate to get two's complement representation
278+
if scaledInt.Sign() < 0 {
279+
dec = dec.Negate()
280+
}
281+
return dec
282+
}
283+
284+
// reverse8 returns a new 8-byte slice holding the first eight bytes of b in
// reverse order. The input must contain at least eight bytes.
func reverse8(b []byte) []byte {
	out := make([]byte, 8)
	for i, c := range b[:8] {
		out[7-i] = c
	}
	return out
}

0 commit comments

Comments
 (0)