|
| 1 | +import StatsBase |
| 2 | +export quantile_space, value_space |
| 3 | + |
| 4 | +########################################################################################### |
| 5 | +# Quantile space |
| 6 | +########################################################################################### |
"""
    quantile_space(A, B; n = 50) → Aq, Bq, q

Express the array `B` in the quantile space of `A`. `A` and `B` must have the same size.

Every element of `A` is assigned to one of `n` bins according to its quantile value,
i.e., the value of the empirical cumulative distribution function of `A` at that element.
The bin edges `q` (`n+1` equally spaced values from `0` to `1`) are returned as well.
The linear indices of the elements of `A` that end up in a bin are then also used to
group the elements of `B`.

`Aq, Bq` are the averages of the grouped elements of `A, B` within each bin.
Because the binning is done in terms of quantiles, each bin holds approximately
`length(A)/n` data points.
"""
function quantile_space(A, B; n = 50)
    @assert size(A) == size(B)
    groups, edges = quantile_decomposition(A, n)
    # One container of per-bin means is returned for each of the arrays passed in.
    means = averages_from_indices(groups, A, B)
    return means[1], means[2], edges
end
| 26 | + |
"""
    quantile_decomposition(A, n) → bin_idxs, qs

Group the linear indices of `A` into `n` bins of equal width in quantile space.

`qs` are the `n+1` equally spaced bin edges from `0` to `1`. The quantile value of each
element is the value of the empirical CDF of `A` (obtained with `StatsBase.ecdf`) at
that element. `bin_idxs[j]` holds the linear indices of the elements whose quantile
value falls in the `j`-th bin. The last bin is closed on the right, so that elements
with quantile value `1` (the maxima of `A`) belong to it.
"""
function quantile_decomposition(A, n)
    Avec = vec(A)
    qs = range(0, 1; length = n+1)
    bin_width = Base.step(qs)
    nbins = length(qs) - 1
    A_cdf = StatsBase.ecdf(Avec)
    A_quantiles = A_cdf.(Avec)
    bin_idxs = [Int[] for _ in 1:nbins]
    for (i, q) in enumerate(A_quantiles)
        # The ECDF is exactly 1 at the maximum of `A`, in which case `q ÷ bin_width`
        # can evaluate to `nbins`. Clamp so that such elements land in the last bin
        # instead of indexing the non-existent bin `nbins + 1`.
        j = min(Int(q ÷ bin_width) + 1, nbins) # index of quantile bin
        push!(bin_idxs[j], i)
    end
    return bin_idxs, qs
end
| 40 | + |
"""
    averages_from_indices(idxs, As...) → means

For every array in `As`, compute the mean of the elements selected by each entry of
`idxs` (each entry being a collection of linear indices). Return a tuple with one
container of means per input array, obtained by mapping over `idxs`.
"""
function averages_from_indices(idxs, As...)
    map(As) do A
        flat = vec(A)
        map(group -> StatsBase.mean(view(flat, group)), idxs)
    end
end
| 44 | + |
| 45 | +# TODO: |
| 46 | +# function quantile_space(A, B, C; n = 50) |
| 47 | +# I am honestly not sure how to go about that. |
| 48 | + |
| 49 | +########################################################################################### |
| 50 | +# Value space |
| 51 | +########################################################################################### |
"""
    value_space(A, B; Arange) → Bmeans, bin_indices

Express the array `B` in the value space of `A`. `A` and `B` must have the same size.

The values of `A` are binned using the sorted bin edges `Arange`, and the linear indices
that end up in each bin of `A` are then used to group the elements of `B`.
The `i`-th bin contains the data whose `A` values are ∈ [`Arange[i]`, `Arange[i+1]`).
`Bmeans` contains the average of the grouped elements of `B` in each bin.

Elements of `A` that do not fall within `Arange` are skipped.
The returned `bin_indices` are the linear indices in each bin (hence, the weights
of the means are just `length.(bin_indices)`).

By default `Arange` is a range of 100 points that starts at `minimum(A)` and ends just
above `maximum(A)`.
"""
function value_space(A, B; Arange = _default_val_range(A))
    @assert size(A) == size(B)
    @assert issorted(Arange)
    groups = indices_in_values(A, Arange)
    # Only one array is averaged, so take the single entry of the returned tuple.
    Bmeans = first(averages_from_indices(groups, B))
    return Bmeans, groups
end
| 73 | + |
# Default bin edges for `value_space`: 100 points from the minimum of `A` up to the
# floating point number right after its maximum, so that the maximum itself falls
# strictly inside the last (half-open) bin. The maximum is converted with `float` first,
# because `nextfloat` is not defined for integers.
function _default_val_range(A)
    lo, hi = extrema(A)
    return range(lo, nextfloat(float(hi)); length = 100)
end
| 75 | + |
"""
    indices_in_values(A, Arange) → bin_idxs

Bin the linear indices of `A` according to the values of `A` and the sorted bin edges
`Arange`. `bin_idxs[i]` contains the linear indices of the elements of `A` whose value
is ∈ [`Arange[i]`, `Arange[i+1]`). Elements below the first edge, or at or above the
last edge, are skipped.
"""
function indices_in_values(A, Arange)
    nbins = length(Arange) - 1
    bin_idxs = [Int[] for _ in 1:nbins]
    # `li` are linear indices of `A`. `bi` are bin indices of the values of `A`.
    for (li, a) in enumerate(vec(A))
        bi = searchsortedlast(Arange, a)
        # `bi == 0`: `a` is below the first edge. `bi > nbins`: `a` is at or above the
        # last edge; bins are half-open, so there is no bin for such a value either.
        (bi == 0 || bi > nbins) && continue
        push!(bin_idxs[bi], li)
    end
    return bin_idxs
end
| 87 | + |
| 88 | +# 2D version |
"""
    value_space(A, B, C; Arange, Brange) → Cmeans, bin_indices

Express the array `C` in the joint value space of `A, B`.
`A, B, C` must have the same size.

A 2D histogram is created based on the given (sorted) ranges, and the elements of `C`
are binned according to the values of `A, B`. Then, the elements in each bin are
averaged, which returns a matrix `Cmeans`, defined over the joint space
S = `Arange × Brange`.
`bin_indices` is also a matrix (with vector elements).
`Cmeans` is `NaN` for bins without any elements.
"""
function value_space(A, B, C;
        Arange = _default_val_range(A), Brange = _default_val_range(B)
    )
    @assert size(A) == size(B) == size(C)
    @assert issorted(Arange)
    # `Brange` is searched with `searchsortedlast` just like `Arange`, so it has to
    # be sorted as well.
    @assert issorted(Brange)
    bin_idxs_1D = indices_in_values(A, Arange)
    bin_idxs_2D = refine_indices_in_values(bin_idxs_1D, Arange, B, Brange)
    Cmeans, = averages_from_indices(bin_idxs_2D, C)
    return Cmeans, bin_idxs_2D
end
| 110 | + |
| 111 | + |
"""
    refine_indices_in_values(bin_idxs_1D, Arange, B, Brange) → bin_idxs

Split the 1D bins `bin_idxs_1D` (linear indices grouped by the bins of `Arange`)
further according to the values of `B` and the sorted bin edges `Brange`.
Return a matrix of size `(length(Arange)-1, length(Brange)-1)` whose `[bi, bj]` entry
holds the linear indices that are in the `bi`-th input bin and whose `B` value is
∈ [`Brange[bj]`, `Brange[bj+1]`). Indices whose `B` value is below the first edge, or
at or above the last edge, of `Brange` are skipped.
"""
function refine_indices_in_values(bin_idxs_1D, Arange, B, Brange)
    vecB = vec(B)
    nbins_A = length(Arange) - 1
    nbins_B = length(Brange) - 1
    bin_idxs = [Int[] for _ in 1:nbins_A, _ in 1:nbins_B]
    for (bi, idxs) in enumerate(bin_idxs_1D) # iterate over bins of A
        # notice that `idxs` is a container of linear indices for B!
        for li in idxs
            bj = searchsortedlast(Brange, vecB[li])
            # `bj == 0`: below the first edge. `bj > nbins_B`: at or above the last
            # edge; bins are half-open, so there is no bin for such a value either.
            (bj == 0 || bj > nbins_B) && continue
            push!(bin_idxs[bi, bj], li)
        end
    end
    return bin_idxs
end
0 commit comments