Skip to content

Commit 794cf4d

Browse files
authored
Merge pull request #57 from mormamn/mor/unknown-field-validation
fix: reject insertAll requests with unknown fields in row data
2 parents a9f75a0 + 37961e3 commit 794cf4d

File tree

5 files changed

+475
-5
lines changed

5 files changed

+475
-5
lines changed

server/handler.go

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ import (
77
"encoding/csv"
88
"errors"
99
"fmt"
10-
"github.com/goccy/bigquery-emulator/internal/contentdata"
1110
"html"
1211
"io"
1312
"mime"
@@ -21,6 +20,8 @@ import (
2120
"sync"
2221
"time"
2322

23+
"github.com/goccy/bigquery-emulator/internal/contentdata"
24+
2425
"cloud.google.com/go/storage"
2526
"github.com/goccy/go-json"
2627
"github.com/goccy/go-zetasqlite"
@@ -3045,6 +3046,13 @@ func (h *tabledataInsertAllHandler) Handle(ctx context.Context, r *tabledataInse
30453046
}
30463047
data = append(data, rowData)
30473048
}
3049+
3050+
// Validate data against schema before insert
3051+
validationErrors := types.ValidateDataAgainstSchema(content.Schema, data)
3052+
if len(validationErrors) > 0 {
3053+
return buildInsertErrorsResponse(validationErrors, len(data)), nil
3054+
}
3055+
30483056
tableDef, err := types.NewTableWithSchema(content, data)
30493057
if err != nil {
30503058
return nil, err
@@ -3064,6 +3072,50 @@ func (h *tabledataInsertAllHandler) Handle(ctx context.Context, r *tabledataInse
30643072
return &bigqueryv2.TableDataInsertAllResponse{}, nil
30653073
}
30663074

3075+
// buildInsertErrorsResponse builds a BigQuery-compatible error response for insertAll validation errors.
3076+
// When any row has an unknown field, the entire batch is rejected:
3077+
// - Invalid rows get an "invalid" error with the unknown field name
3078+
// - Valid rows get a "stopped" error (they weren't processed because the batch failed)
3079+
func buildInsertErrorsResponse(errors []types.FieldValidationError, totalRows int) *bigqueryv2.TableDataInsertAllResponse {
3080+
invalidRows := make(map[int]bool)
3081+
var insertErrors []*bigqueryv2.TableDataInsertAllResponseInsertErrors
3082+
3083+
// Add "invalid" errors for rows with unknown fields
3084+
for _, e := range errors {
3085+
invalidRows[e.RowIndex] = true
3086+
insertErrors = append(insertErrors, &bigqueryv2.TableDataInsertAllResponseInsertErrors{
3087+
Index: int64(e.RowIndex),
3088+
Errors: []*bigqueryv2.ErrorProto{{
3089+
DebugInfo: "",
3090+
Reason: "invalid",
3091+
Location: e.FieldName,
3092+
Message: fmt.Sprintf("no such field: %s.", e.FieldName),
3093+
ForceSendFields: []string{"DebugInfo"}, // Ensure DebugInfo is included even when empty
3094+
}},
3095+
ForceSendFields: []string{"Index"}, // Ensure Index is included even when 0
3096+
})
3097+
}
3098+
3099+
// Add "stopped" errors for valid rows (they weren't inserted because batch failed)
3100+
for i := 0; i < totalRows; i++ {
3101+
if !invalidRows[i] {
3102+
insertErrors = append(insertErrors, &bigqueryv2.TableDataInsertAllResponseInsertErrors{
3103+
Index: int64(i),
3104+
ForceSendFields: []string{"Index"}, // Ensure Index is included even when 0
3105+
Errors: []*bigqueryv2.ErrorProto{{
3106+
Reason: "stopped",
3107+
Location: "",
3108+
DebugInfo: "",
3109+
Message: "",
3110+
ForceSendFields: []string{"Location", "DebugInfo", "Message"}, // Ensure all empty fields are included
3111+
}},
3112+
})
3113+
}
3114+
}
3115+
3116+
return &bigqueryv2.TableDataInsertAllResponse{InsertErrors: insertErrors}
3117+
}
3118+
30673119
func (h *tabledataListHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
30683120
ctx := r.Context()
30693121
server := serverFromContext(ctx)

test/python/emulator_test.py

Lines changed: 233 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
"""Tests capabilities of the BigQuery emulator."""
2+
23
import base64
34
import datetime
45
from datetime import date
@@ -1254,12 +1255,240 @@ def test_bytes_field_base64_encoding(self) -> None:
12541255
{
12551256
"id": 2,
12561257
"binary_data": binary_bytes,
1257-
"explicit_base64": binary_base64
1258+
"explicit_base64": binary_base64,
12581259
},
1260+
{"id": 3, "binary_data": empty_bytes, "explicit_base64": empty_base64},
1261+
],
1262+
)
1263+
1264+
def test_insert_unknown_fields_valid_row(self) -> None:
    """
    Tests resolution of https://github.com/goccy/bigquery-emulator/issues/421

    Baseline case: a row that only uses fields declared in the table schema
    must be accepted, i.e. insert_rows_json reports an empty error list.
    """
    target = BigQueryAddress(dataset_id=_DATASET_1, table_id=_TABLE_1)
    name_field = bigquery.SchemaField(
        "name",
        field_type=bigquery.enums.SqlTypeNames.STRING.value,
        mode="REQUIRED",
    )
    age_field = bigquery.SchemaField(
        "age",
        field_type=bigquery.enums.SqlTypeNames.INTEGER.value,
        mode="NULLABLE",
    )
    self.create_mock_table(target, schema=[name_field, age_field])

    table_ref = self._table_ref_for_address(target)
    table = self.client.get_table(table_ref)
    insert_errors = self.client.insert_rows_json(
        table, [{"name": "Alice", "age": 30}]
    )

    self.assertEqual(insert_errors, [])
1290+
1291+
def test_insert_unknown_fields_one_bad_field(self) -> None:
    """
    Tests resolution of https://github.com/goccy/bigquery-emulator/issues/421

    A single row carrying one field that is not in the schema must come back
    as one "invalid" insert error whose location and message name that field.
    """
    target = BigQueryAddress(dataset_id=_DATASET_1, table_id=_TABLE_1)
    name_field = bigquery.SchemaField(
        "name",
        field_type=bigquery.enums.SqlTypeNames.STRING.value,
        mode="REQUIRED",
    )
    age_field = bigquery.SchemaField(
        "age",
        field_type=bigquery.enums.SqlTypeNames.INTEGER.value,
        mode="NULLABLE",
    )
    self.create_mock_table(target, schema=[name_field, age_field])

    table_ref = self._table_ref_for_address(target)
    table = self.client.get_table(table_ref)
    insert_errors = self.client.insert_rows_json(
        table, [{"name": "Bob", "age": 25, "unknown_field": "value"}]
    )

    # The only row (index 0) is rejected because of "unknown_field".
    expected_detail = {
        "reason": "invalid",
        "location": "unknown_field",
        "debugInfo": "",
        "message": "no such field: unknown_field.",
    }
    expected = [{"index": 0, "errors": [expected_detail]}]
    self.assertEqual(insert_errors, expected)
1333+
1334+
def test_insert_unknown_fields_multiple_bad_fields(self) -> None:
    """
    Tests resolution of https://github.com/goccy/bigquery-emulator/issues/421

    A row with several unknown fields must still produce exactly one error
    entry containing exactly one "invalid" error; which of the unknown fields
    is reported is not pinned down by this test.
    """
    target = BigQueryAddress(dataset_id=_DATASET_1, table_id=_TABLE_1)
    name_field = bigquery.SchemaField(
        "name",
        field_type=bigquery.enums.SqlTypeNames.STRING.value,
        mode="REQUIRED",
    )
    age_field = bigquery.SchemaField(
        "age",
        field_type=bigquery.enums.SqlTypeNames.INTEGER.value,
        mode="NULLABLE",
    )
    self.create_mock_table(target, schema=[name_field, age_field])

    table_ref = self._table_ref_for_address(target)
    table = self.client.get_table(table_ref)
    row = {"name": "Charlie", "unknown1": "a", "unknown2": "b", "unknown3": "c"}
    insert_errors = self.client.insert_rows_json(table, [row])

    # Length checks come before indexing so a wrong shape fails as an assertion.
    self.assertEqual(len(insert_errors), 1)
    row_error = insert_errors[0]
    self.assertEqual(row_error["index"], 0)
    self.assertEqual(len(row_error["errors"]), 1)
    detail = row_error["errors"][0]
    self.assertEqual(detail["reason"], "invalid")
    # One of the unknown fields should be reported
    self.assertIn(detail["location"], ["unknown1", "unknown2", "unknown3"])
    self.assertIn("no such field:", detail["message"])
1371+
1372+
def test_insert_unknown_fields_multiple_bad_rows(self) -> None:
    """
    Tests resolution of https://github.com/goccy/bigquery-emulator/issues/421

    When every row in the batch has an unknown field, each row index must be
    reported once, each with a single "invalid" error.
    """
    target = BigQueryAddress(dataset_id=_DATASET_1, table_id=_TABLE_1)
    name_field = bigquery.SchemaField(
        "name",
        field_type=bigquery.enums.SqlTypeNames.STRING.value,
        mode="REQUIRED",
    )
    age_field = bigquery.SchemaField(
        "age",
        field_type=bigquery.enums.SqlTypeNames.INTEGER.value,
        mode="NULLABLE",
    )
    self.create_mock_table(target, schema=[name_field, age_field])

    table_ref = self._table_ref_for_address(target)
    table = self.client.get_table(table_ref)
    rows = [
        {"name": "Invalid1", "bad_field1": "x"},
        {"name": "Invalid2", "bad_field2": "y"},
    ]
    insert_errors = self.client.insert_rows_json(table, rows)

    # Both rows should have errors
    self.assertEqual(len(insert_errors), 2)

    reported_rows = set()
    for row_error in insert_errors:
        reported_rows.add(row_error["index"])
    self.assertEqual(reported_rows, {0, 1})

    for row_error in insert_errors:
        details = row_error["errors"]
        self.assertEqual(len(details), 1)
        self.assertEqual(details[0]["reason"], "invalid")
        self.assertIn("no such field:", details[0]["message"])
1411+
1412+
def test_insert_unknown_fields_mixed_valid_and_invalid(self) -> None:
    """
    Tests resolution of https://github.com/goccy/bigquery-emulator/issues/421

    Batch mixing valid and invalid rows: the expected response lists the
    invalid rows first with "invalid" errors naming a field, followed by the
    otherwise valid rows with empty "stopped" errors.
    """

    def row_error(index, reason, location, message):
        # Shape of one insert error entry as returned by insert_rows_json.
        return {
            "index": index,
            "errors": [
                {
                    "reason": reason,
                    "location": location,
                    "debugInfo": "",
                    "message": message,
                }
            ],
        }

    target = BigQueryAddress(dataset_id=_DATASET_1, table_id=_TABLE_1)
    name_field = bigquery.SchemaField(
        "name",
        field_type=bigquery.enums.SqlTypeNames.STRING.value,
        mode="REQUIRED",
    )
    age_field = bigquery.SchemaField(
        "age",
        field_type=bigquery.enums.SqlTypeNames.INTEGER.value,
        mode="NULLABLE",
    )
    self.create_mock_table(target, schema=[name_field, age_field])

    table_ref = self._table_ref_for_address(target)
    table = self.client.get_table(table_ref)
    rows = [
        {"name": "Valid1", "age": 20},  # row 0: valid
        {"name": "Invalid1", "bad_field": "x"},  # row 1: invalid
        {"name": "Valid2", "age": 30},  # row 2: valid
        {"name": "Invalid2", "bad1": "a", "bad2": "b"},  # row 3: invalid
    ]
    insert_errors = self.client.insert_rows_json(table, rows)

    expected = [
        row_error(1, "invalid", "bad_field", "no such field: bad_field."),
        row_error(3, "invalid", "bad1", "no such field: bad1."),
        row_error(0, "stopped", "", ""),
        row_error(2, "stopped", "", ""),
    ]
    self.assertEqual(insert_errors, expected)

types/types.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -486,6 +486,7 @@ func NewTableWithSchema(t *bigqueryv2.Table, data Data) (*Table, error) {
486486
for k, v := range row {
487487
field, exists := nameToFieldMap[k]
488488
if !exists {
489+
// Skip unknown fields - validation should be done by the caller
489490
continue
490491
}
491492
v, err := normalizeData(v, field)

0 commit comments

Comments
 (0)