Skip to content

Commit c562362

Browse files
Only GC when needed. Reduce allocs from mechanics. Add PrecompileTools workload. (#419)
* Reduce overhead in hot benchmark loop - Make `Benchmark` parametric (`Benchmark{F,Q}`) so `samplefunc` and `quote_vals` have concrete types, eliminating dynamic dispatch and boxing on every sample call - Skip `gcscrub()` before `gctrial`/`gcsample` when the previous sample (or warmup) reported zero allocations — nothing to collect - Pre-allocate `Trial` vectors with `sizehint!` based on the first real sample time, avoiding repeated heap growth and GC churn from the harness itself during the run - Add a test asserting the harness itself reports zero allocations for a zero-allocation benchmark * add a PrecompileTools workload * Use function barriers and reduce allocations in hot loops Revert Benchmark to non-parametric (easier to pass around) and use function barriers (_run_inner, _lineartrial_inner) so Julia specializes the sampling loops on concrete samplefunc/quote_vals types without parameterizing the struct. - Skip GC scrub when warmup/sample reported zero allocations - Temporarily set evals=1 for warmup instead of allocating new Parameters - Use explicit push!(trial, s[1], s[2], s[3], s[4]) instead of s[1:(end-1)]... to avoid intermediate tuple allocation - resize! instead of slice-copy in _lineartrial_inner Reduces steady-state allocations from ~102K to ~96 per benchmark run. * add concurrency cancel in progress * don't capture the returned result if not exposed * Use Ref{SampleResult} to avoid heap-allocating return tuples The samplefunc is stored as `Function` (abstract type), so every call returned a heap-allocated tuple through dynamic dispatch. With ~10K samples per benchmark, this was ~10K allocs per run. Now the caller passes a pre-allocated Ref{SampleResult} that the samplefunc writes into, reducing per-benchmark allocs from ~10K to ~12 (all structural: Parameters copy, Trial, vectors, etc). 
* Update .gitignore * format Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> * don't specialize show methods on IO type --------- Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
1 parent 78bed8d commit c562362

8 files changed

Lines changed: 100 additions & 22 deletions

File tree

.github/workflows/CI.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@ on:
77
branches:
88
- main
99
tags: '*'
10+
concurrency:
11+
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
12+
cancel-in-progress: true
1013
jobs:
1114
test:
1215
name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ github.event_name }}

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,5 @@ test/x.json
66
docs/Manifest.toml
77
docs/build
88
docs/src/assets/indigo.css
9-
Manifest.toml
9+
Manifest.toml
10+
.DS_Store

Project.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ version = "1.7.0"
66
Compat = "34da2185-b29b-5c13-b0c7-acf172513d20"
77
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
88
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
9+
PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
910
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
1011
Profile = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79"
1112
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
@@ -16,6 +17,7 @@ Aqua = "0.8"
1617
Compat = "4.11"
1718
JSON = "0.18, 0.19, 0.20, 0.21, 1.2"
1819
Logging = "<0.0.1, 1"
20+
PrecompileTools = "1"
1921
Printf = "<0.0.1, 1"
2022
Profile = "<0.0.1, 1"
2123
Statistics = "<0.0.1, 1"

src/BenchmarkTools.jl

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ using Printf
1010
using Profile
1111
using Compat
1212

13+
using PrecompileTools: @compile_workload, @setup_workload
14+
1315
##############
1416
# Parameters #
1517
##############
@@ -80,4 +82,12 @@ export tune!,
8082

8183
include("serialization.jl")
8284

85+
@setup_workload begin
86+
@compile_workload begin
87+
s = @benchmark 1 + 1
88+
io = IOContext(IOBuffer(), :color => true)
89+
show(io, MIME("text/plain"), s)
90+
end
91+
end
92+
8393
end # module BenchmarkTools

src/execution.jl

Lines changed: 65 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ mutable struct Benchmark
2020
params::Parameters
2121
end
2222

23+
const SampleResult = Tuple{Float64,Float64,Int,Int}
24+
2325
params(b::Benchmark) = b.params
2426

2527
function loadparams!(b::Benchmark, params::Parameters, fields...)
@@ -106,24 +108,57 @@ end
106108
# Note that trials executed via `run` and `lineartrial` are always executed at top-level
107109
# scope, in order to allow transfer of locally-scoped variables into benchmark scope.
108110

109-
function _run(b::Benchmark, p::Parameters; verbose=false, pad="", warmup=true, kwargs...)
111+
function _run(
112+
b::Benchmark,
113+
p::Parameters;
114+
verbose=false,
115+
pad="",
116+
warmup=true,
117+
capture_result=true,
118+
kwargs...,
119+
)
110120
params = Parameters(p; kwargs...)
111121
@assert params.seconds > 0.0 "time limit must be greater than 0.0"
122+
sample_ref = Ref{SampleResult}((0.0, 0.0, 0, 0))
112123
if warmup
113-
b.samplefunc(b.quote_vals, Parameters(params; evals=1)) #warmup sample
124+
saved_evals = params.evals
125+
params.evals = 1
126+
b.samplefunc(b.quote_vals, params, sample_ref, nothing)
127+
warmup_allocs = sample_ref[][4]
128+
params.evals = saved_evals
129+
params.gctrial && warmup_allocs > 0 && gcscrub()
114130
end
115131
trial = Trial(params)
116-
params.gctrial && gcscrub()
117132
start_time = Base.time()
118-
s = b.samplefunc(b.quote_vals, params)
119-
push!(trial, s[1:(end - 1)]...)
120-
return_val = s[end]
133+
b.samplefunc(b.quote_vals, params, sample_ref, nothing)
134+
s = sample_ref[]
135+
push!(trial, s[1], s[2], s[3], s[4])
136+
sample_time_s = s[1] * params.evals / 1e9
137+
estimated_remaining = if sample_time_s > 0
138+
min(
139+
params.samples - 1,
140+
ceil(Int, (params.seconds - (Base.time() - start_time)) / sample_time_s),
141+
)
142+
else
143+
params.samples - 1
144+
end
145+
sizehint!(trial.times, 1 + estimated_remaining)
146+
sizehint!(trial.gctimes, 1 + estimated_remaining)
121147
iters = 2
122148
while (Base.time() - start_time) < params.seconds && iters ≤ params.samples
123-
params.gcsample && gcscrub()
124-
push!(trial, b.samplefunc(b.quote_vals, params)[1:(end - 1)]...)
149+
params.gcsample && s[4] > 0 && gcscrub()
150+
b.samplefunc(b.quote_vals, params, sample_ref, nothing)
151+
s = sample_ref[]
152+
push!(trial, s[1], s[2], s[3], s[4])
125153
iters += 1
126154
end
155+
return_val = if capture_result
156+
result_ref = Ref{Any}()
157+
b.samplefunc(b.quote_vals, params, sample_ref, result_ref)
158+
result_ref[]
159+
else
160+
nothing
161+
end
127162
return trial, return_val
128163
end
129164

@@ -140,7 +175,7 @@ function Base.run(
140175
ndone=NaN,
141176
kwargs...,
142177
)
143-
return run_result(b, p; kwargs...)[1]
178+
return run_result(b, p; capture_result=false, kwargs...)[1]
144179
end
145180

146181
"""
@@ -182,18 +217,25 @@ function _lineartrial(b::Benchmark, p::Parameters=b.params; maxevals=RESOLUTION,
182217
params = Parameters(p; kwargs...)
183218
estimates = zeros(maxevals)
184219
completed = 0
220+
sample_ref = Ref{SampleResult}((0.0, 0.0, 0, 0))
185221
params.evals = 1
186-
b.samplefunc(b.quote_vals, params) #warmup sample
187-
params.gctrial && gcscrub()
222+
b.samplefunc(b.quote_vals, params, sample_ref, nothing)
223+
warmup_allocs = sample_ref[][4]
224+
params.gctrial && warmup_allocs > 0 && gcscrub()
188225
start_time = time()
226+
prev_allocs = warmup_allocs
189227
for evals in eachindex(estimates)
190-
params.gcsample && gcscrub()
228+
params.gcsample && prev_allocs > 0 && gcscrub()
191229
params.evals = evals
192-
estimates[evals] = first(b.samplefunc(b.quote_vals, params))
230+
b.samplefunc(b.quote_vals, params, sample_ref, nothing)
231+
s = sample_ref[]
232+
estimates[evals] = s[1]
233+
prev_allocs = s[4]
193234
completed += 1
194235
((time() - start_time) > params.seconds) && break
195236
end
196-
return estimates[1:completed]
237+
resize!(estimates, completed)
238+
return estimates
197239
end
198240

199241
function warmup(item; verbose::Bool=true)
@@ -605,7 +647,10 @@ function generate_benchmark_definition(
605647
$(core_body)
606648
end
607649
@noinline function $(samplefunc)(
608-
$(Expr(:tuple, quote_vars...)), __params::$BenchmarkTools.Parameters
650+
$(Expr(:tuple, quote_vars...)),
651+
__params::$BenchmarkTools.Parameters,
652+
__sample_ref::Ref{$BenchmarkTools.SampleResult},
653+
__result_ref::Union{Ref{Any},Nothing},
609654
)
610655
$(setup)
611656
__evals = __params.evals
@@ -618,6 +663,9 @@ function generate_benchmark_definition(
618663
__sample_time = time_ns() - __start_time
619664
__gcdiff = Base.GC_Diff(Base.gc_num(), __gc_start)
620665
$(teardown)
666+
if __result_ref !== nothing
667+
__result_ref[] = __return_val
668+
end
621669
__time = max((__sample_time / __evals) - __params.overhead, 0.001)
622670
__gctime = max((__gcdiff.total_time / __evals) - __params.overhead, 0.0)
623671
__memory = Int(Base.fld(__gcdiff.allocd, __evals))
@@ -630,7 +678,8 @@ function generate_benchmark_definition(
630678
__evals,
631679
),
632680
)
633-
return __time, __gctime, __memory, __allocs, __return_val
681+
__sample_ref[] = (__time, __gctime, __memory, __allocs)
682+
return nothing
634683
end
635684
end,
636685
)

src/groups.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -336,7 +336,7 @@ function Base.summary(io::IO, group::BenchmarkGroup)
336336
return print(io, "$(length(group))-element BenchmarkGroup($(tagrepr(group.tags)))")
337337
end
338338

339-
function Base.show(io::IO, group::BenchmarkGroup)
339+
function Base.show(@nospecialize(io::IO), group::BenchmarkGroup)
340340
limit = get(io, :limit, true)
341341
if !(limit isa Bool)
342342
msg = (

src/trials.jl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -371,7 +371,7 @@ Base.show(io::IO, t::TrialEstimate) = _show(io, t)
371371
Base.show(io::IO, t::TrialRatio) = _show(io, t)
372372
Base.show(io::IO, t::TrialJudgement) = _show(io, t)
373373

374-
function Base.show(io::IO, ::MIME"text/plain", t::Trial)
374+
function Base.show(@nospecialize(io::IO), ::MIME"text/plain", t::Trial)
375375
pad = get(io, :pad, "")
376376
print(
377377
io,
@@ -578,7 +578,7 @@ function Base.show(io::IO, ::MIME"text/plain", t::Trial)
578578
return print(io, ".")
579579
end
580580

581-
function Base.show(io::IO, ::MIME"text/plain", t::TrialEstimate)
581+
function Base.show(@nospecialize(io::IO), ::MIME"text/plain", t::TrialEstimate)
582582
println(io, "BenchmarkTools.TrialEstimate: ")
583583
pad = get(io, :pad, "")
584584
println(io, pad, " time: ", prettytime(time(t)))
@@ -595,7 +595,7 @@ function Base.show(io::IO, ::MIME"text/plain", t::TrialEstimate)
595595
return print(io, pad, " allocs: ", allocs(t))
596596
end
597597

598-
function Base.show(io::IO, ::MIME"text/plain", t::TrialRatio)
598+
function Base.show(@nospecialize(io::IO), ::MIME"text/plain", t::TrialRatio)
599599
println(io, "BenchmarkTools.TrialRatio: ")
600600
pad = get(io, :pad, "")
601601
println(io, pad, " time: ", time(t))
@@ -604,7 +604,7 @@ function Base.show(io::IO, ::MIME"text/plain", t::TrialRatio)
604604
return print(io, pad, " allocs: ", allocs(t))
605605
end
606606

607-
function Base.show(io::IO, ::MIME"text/plain", t::TrialJudgement)
607+
function Base.show(@nospecialize(io::IO), ::MIME"text/plain", t::TrialJudgement)
608608
println(io, "BenchmarkTools.TrialJudgement: ")
609609
pad = get(io, :pad, "")
610610
print(io, pad, " time: ", prettydiff(time(ratio(t))), " => ")

test/ExecutionTests.jl

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,10 @@ str = String(take!(io))
317317
b = @bprofile 1 + 1 gctrial = true
318318
Profile.print(IOContext(io, :displaysize => (24, 200)))
319319
str = String(take!(io))
320+
@test !occursin("gcscrub", str) # no allocs, so gcscrub is skipped even with gctrial=true
321+
b = @bprofile Ref(1) gctrial = true gcsample = true
322+
Profile.print(IOContext(io, :displaysize => (24, 200)))
323+
str = String(take!(io))
320324
@test occursin("gcscrub", str)
321325

322326
########
@@ -398,6 +402,15 @@ b = x = nothing
398402
GC.gc()
399403
@test x_finalized
400404

405+
# Ensure the harness itself doesn't allocate for a zero-allocation benchmark
406+
let b = @benchmarkable sin($(1))
407+
tune!(b)
408+
sample_ref = Ref{BenchmarkTools.SampleResult}((0.0, 0.0, 0, 0))
409+
b.samplefunc(b.quote_vals, b.params, sample_ref, nothing)
410+
s = sample_ref[]
411+
@test s[4] == 0 # allocs
412+
end
413+
401414
# Ensure mapvals(f) throws MethodError
402415
@test_throws MethodError BenchmarkTools.mapvals(max)
403416

0 commit comments

Comments
 (0)