Skip to content

Commit 90a3efa

Browse files
fix: scan files with lines exceeding bufio's default 64 KB token limit (#5022)
* fix: scan files with lines exceeding bufio's default 64 KB token limit * added related test
1 parent 5a0f877 commit 90a3efa

2 files changed

Lines changed: 56 additions & 1 deletion

File tree

pkg/sources/git/git.go

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -750,7 +750,6 @@ func (s *Git) ScanCommits(ctx context.Context, repo *git.Repository, path string
750750

751751
email := commit.Author
752752
when := commit.Date.UTC().Format("2006-01-02 15:04:05 -0700")
753-
754753
if fullHash != lastCommitHash {
755754
depth++
756755
lastCommitHash = fullHash
@@ -901,6 +900,14 @@ func (s *Git) gitChunk(ctx context.Context, diff *gitparse.Diff, fileName, email
901900
defer func() { _ = reader.Close() }()
902901

903902
originalChunk := bufio.NewScanner(reader)
903+
// Default bufio max token size (64 KB) is too small for files with long lines
904+
// (e.g. minified JS, base64 blobs). Raise the cap to 10 MB so those lines
905+
// are still scanned; the oversize-line path below will chunk them correctly.
906+
// The initial buffer starts at 4 KB (same as bufio's default) and grows only
907+
// when a line actually exceeds the current size, keeping allocations cheap for
908+
// the common case of small diffs.
909+
const maxScanTokenSize = 10 * 1024 * 1024
910+
originalChunk.Buffer(make([]byte, 4096), maxScanTokenSize)
904911
newChunkBuffer := bytes.Buffer{}
905912
lastOffset := 0
906913
for offset := 0; originalChunk.Scan(); offset++ {
@@ -967,6 +974,9 @@ func (s *Git) gitChunk(ctx context.Context, diff *gitparse.Diff, fileName, email
967974
ctx.Logger().Error(err, "error writing to chunk buffer", "filename", fileName, "commit", hash, "file", diff.PathB)
968975
}
969976
}
977+
if err := originalChunk.Err(); err != nil {
978+
ctx.Logger().Error(err, "error scanning chunk", "filename", fileName, "commit", hash, "file", diff.PathB)
979+
}
970980
// Send anything still in the new chunk buffer
971981
if newChunkBuffer.Len() > 0 {
972982
metadata := s.sourceMetadataFunc(SourceMetadataInfo{

pkg/sources/git/git_test.go

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"path/filepath"
99
"runtime"
1010
"strings"
11+
"sync"
1112
"testing"
1213
"time"
1314

@@ -1163,3 +1164,47 @@ func TestPrepareRepoWithNormalizationBare(t *testing.T) {
11631164
})
11641165
}
11651166
}
1167+
1168+
// TestGitChunk_LongLine verifies that files containing lines longer than
1169+
// bufio's default 64 KB token limit are still scanned. Before the fix,
1170+
// bufio.Scanner would silently stop on the first oversized line and produce
1171+
// zero chunks for that file.
1172+
func TestGitChunk_LongLine(t *testing.T) {
1173+
t.Parallel()
1174+
ctx := context.Background()
1175+
1176+
repoPath := setupTestRepo(t, "long-line-repo")
1177+
1178+
// Build a single line of 100 KB (no newline), well above the old 64 KB cap.
1179+
longLine := strings.Repeat("a", 100*1024)
1180+
addTestFileAndCommit(t, repoPath, "long_line.txt", longLine)
1181+
1182+
conn, err := anypb.New(&sourcespb.Git{
1183+
Credential: &sourcespb.Git_Unauthenticated{
1184+
Unauthenticated: &credentialspb.Unauthenticated{},
1185+
},
1186+
Repositories: []string{"file://" + repoPath},
1187+
})
1188+
assert.NoError(t, err)
1189+
1190+
s := Source{}
1191+
assert.NoError(t, s.Init(ctx, "test long line", 0, 0, false, conn, 1))
1192+
1193+
chunksCh := make(chan *sources.Chunk, 64)
1194+
var count int
1195+
var wg sync.WaitGroup
1196+
wg.Add(1)
1197+
go func() { // count chunks as they arrive on chunksCh
1198+
defer wg.Done()
1199+
for range chunksCh {
1200+
count++
1201+
}
1202+
}()
1203+
1204+
assert.NoError(t, s.Chunks(ctx, chunksCh))
1205+
close(chunksCh) // ends the range loop in the counting goroutine
1206+
wg.Wait()
1207+
// wg.Wait() above guarantees the counting goroutine has finished, so count is safe to read here.
1208+
// Exactly two chunks are expected: presumably one for the commit/file metadata and one for the 100 KB line — confirm against gitChunk.
1209+
assert.Equal(t, 2, count, "expected two chunks from a file with a 100 KB line")
1210+
}

0 commit comments

Comments
 (0)