@@ -185,7 +185,58 @@ func (w *{{.Name}}ColumnChunkWriter) WriteDictIndices(indices arrow.Array, defLe
185185}
186186
187187func (w *{{.Name }}ColumnChunkWriter) writeValues(values []{{.name }}, numNulls int64) {
188+ {{- if or (eq .Name " ByteArray" ) (eq .Name " FixedLenByteArray" )}}
189+ // For variable-length types, we need to check buffer size to prevent int32 overflow
190+ // For small values (<1MB), checking frequently adds negligible overhead
191+ // For large values (>1MB), we MUST check before each value
192+ const maxSafeBufferSize = 1.0 * 1024 * 1024 * 1024 // 1GB threshold
193+ const largeValueThreshold = 1.0 * 1024 * 1024 // 1MB
194+
195+ encoder := w.currentEncoder. (encoding. {{.Name }}Encoder)
196+ currentSize := w.currentEncoder.EstimatedDataEncodedSize ()
197+
198+ // Batch process small values, check individually for large values
199+ batchStart := 0
200+ for i := 0; i < len (values); i++ {
201+ {{- if eq .Name " ByteArray" }}
202+ valueSize := int64(len (values[i]))
203+ {{- else }}
204+ valueSize := int64(w.descr.TypeLength ())
205+ {{- end }}
206+
207+ // If this value might cause overflow, flush first
208+ if currentSize + valueSize >= maxSafeBufferSize {
209+ // Add accumulated batch before flushing
210+ if i > batchStart {
211+ encoder.Put (values[batchStart:i])
212+ currentSize = w.currentEncoder.EstimatedDataEncodedSize ()
213+ }
214+ // Flush the page
215+ if err := w.FlushCurrentPage (); err != nil {
216+ panic(err)
217+ }
218+ batchStart = i
219+ currentSize = 0
220+ }
221+
222+ // Track size estimate
223+ currentSize += valueSize + 4 // +4 for length prefix
224+
225+ // For large values, add and flush immediately if needed
226+ if valueSize >= largeValueThreshold {
227+ encoder.Put (values[i:i+1])
228+ batchStart = i + 1
229+ currentSize = w.currentEncoder.EstimatedDataEncodedSize ()
230+ }
231+ }
232+
233+ // Add remaining batch
234+ if batchStart < len (values) {
235+ encoder.Put (values[batchStart:])
236+ }
237+ {{- else }}
188238 w.currentEncoder. (encoding. {{.Name }}Encoder).Put (values)
239+ {{- end }}
189240 if w.pageStatistics != nil {
190241{{- if ne .Name " FixedLenByteArray" }}
191242 w.pageStatistics. (*metadata. {{.Name }}Statistics).Update (values, numNulls)
@@ -204,11 +255,54 @@ func (w *{{.Name}}ColumnChunkWriter) writeValues(values []{{.name}}, numNulls in
204255}
205256
206257func (w *{{.Name }}ColumnChunkWriter) writeValuesSpaced(spacedValues []{{.name }}, numRead, numValues int64, validBits []byte, validBitsOffset int64) {
258+ {{- if or (eq .Name " ByteArray" ) (eq .Name " FixedLenByteArray" )}}
259+ // For variable-length types, we need to check buffer size to prevent int32 overflow
260+ // For small values (<1MB), checking frequently adds negligible overhead
261+ // For large values (>1MB), we MUST check before each value
262+ const maxSafeBufferSize = 1.0 * 1024 * 1024 * 1024 // 1GB threshold
263+ const largeValueThreshold = 1.0 * 1024 * 1024 // 1MB
264+
265+ encoder := w.currentEncoder. (encoding. {{.Name }}Encoder)
266+ currentSize := w.currentEncoder.EstimatedDataEncodedSize ()
267+
268+ for i := 0; i < len (spacedValues); i++ {
269+ {{- if eq .Name " ByteArray" }}
270+ valueSize := int64(len (spacedValues[i]))
271+ {{- else }}
272+ valueSize := int64(w.descr.TypeLength ())
273+ {{- end }}
274+
275+ // If this value might cause overflow, flush first
276+ if currentSize + valueSize >= maxSafeBufferSize {
277+ if err := w.FlushCurrentPage (); err != nil {
278+ // If flush fails, panic will be caught by WriteBatch's defer recover
279+ panic(err)
280+ }
281+ currentSize = 0
282+ }
283+
284+ // Add the value
285+ chunk := spacedValues[i:i+1]
286+ if len (spacedValues) != int(numRead) && validBits != nil {
287+ encoder.PutSpaced (chunk, validBits, validBitsOffset+int64(i))
288+ } else {
289+ encoder.Put (chunk)
290+ }
291+
292+ // Track size estimate (only update for large values or every 100 values)
293+ if valueSize >= largeValueThreshold || i % 100 == 0 {
294+ currentSize = w.currentEncoder.EstimatedDataEncodedSize ()
295+ } else {
296+ currentSize += valueSize + 4 // +4 for length prefix
297+ }
298+ }
299+ {{- else }}
207300 if len (spacedValues) != int(numRead) {
208301 w.currentEncoder. (encoding. {{.Name }}Encoder).PutSpaced (spacedValues, validBits, validBitsOffset)
209302 } else {
210303 w.currentEncoder. (encoding. {{.Name }}Encoder).Put (spacedValues)
211304 }
305+ {{- end }}
212306 if w.pageStatistics != nil {
213307 nulls := numValues - numRead
214308{{- if ne .Name " FixedLenByteArray" }}
0 commit comments