Add quantile/value space functions (#94)

Datseris · web-flow · commit 0a7639c8fb40 · 2022-06-17T13:29:25.000+02:00
* add first code version

* reorganize into subfolder

* remove unnecessary return of `weights`

by definition all weights are the same up to +/- 1 element.

* use reusable function for averages from indices

* add value space

* add tests for value space

* add docs and version

* fix typo

* remove ambiguous setindex method...?
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "ClimateBase"
 uuid = "35604d93-0fb8-4872-9436-495b01d137e2"
 authors = ["Datseris <datseris.george@gmail.com>", "Philippe Roy <borghor@yahoo.ca>"]
-version = "0.16.0"
+version = "0.16.1"
 
 [deps]
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
diff --git a/docs/src/advanced.md b/docs/src/advanced.md
@@ -18,6 +18,12 @@ interpolate_height2pressure
 interpolate_pressure2height
 ```
 
+## Quantile/value space
+```@docs
+quantile_space
+value_space
+```
+
 ## Climate quantities
 Functions that calculate climate-related quantities.
 ```@docs
diff --git a/src/ClimateBase.jl b/src/ClimateBase.jl
@@ -4,7 +4,6 @@ export NCDataset
 include("core/coredefs.jl")
 include("core/prettyprint.jl")
 include("core/aggregation.jl")
-include("core/interpolation.jl")
 
 include("physical_dimensions/spatial.jl")
 include("physical_dimensions/spatial_equalarea.jl")
@@ -13,6 +12,9 @@ include("physical_dimensions/temporal.jl")
 include("io/vector2range.jl")
 include("io/netcdf.jl")
 
+include("interpolations/height_interpolation.jl")
+include("interpolations/quantile_space.jl")
+
 # All following will be moved to ClimateTools.jl once its updated
 include("climate/solar.jl")
 include("tsa/continuation.jl")
diff --git a/src/core/coredefs.jl b/src/core/coredefs.jl
@@ -8,6 +8,7 @@ using DimensionalData.Dimensions: setdims
 using Dates
 
 Time = DimensionalData.Ti
+Tim = DimensionalData.Ti
 
 AbDimArray = DimensionalData.AbstractDimArray
 
@@ -30,7 +31,7 @@ gnv(x::Dimension) = parent(parent(x))
 
 export At, (..), Between, Near # Selectors from DimensionalArrays.jl
 export hasdim, dims, dimindex
-export Time, Lon, Lat, dims, Coord, Hei, Pre, Ti
+export Time, Lon, Lat, dims, Coord, Hei, Pre, Ti, Tim
 export CoordinateSpace, OrthogonalSpace, spacestructure
 export DimensionalData # for accessing its functions
 export setdims
@@ -174,8 +175,6 @@ ClimArray(A::AbstractArray, dims::Tuple, name; refdims=(), attrib=nothing) =
     ClimArray(A, format(dims, A), refdims, Symbol(name), attrib)
 
 Base.parent(A::ClimArray) = A.data
-Base.@propagate_inbounds Base.setindex!(A::ClimArray, x, I::Vararg{DimensionalData.StandardIndices}) =
-    setindex!(A.data, x, I...)
 
 DimensionalData.metadata(A::ClimArray) = A.attrib
 DimensionalData.basetypeof(::ClimArray) = ClimArray
diff --git a/src/interpolations/height_interpolation.jl b/src/interpolations/height_interpolation.jl
diff --git a/src/interpolations/quantile_space.jl b/src/interpolations/quantile_space.jl
@@ -0,0 +1,127 @@
+import StatsBase
+export quantile_space, value_space
+
+###########################################################################################
+# Quantile space
+###########################################################################################
+"""
+    quantile_space(A, B; n = 50) → Aq, Bq, q
+Express the array `B` in the quantile space of `A`. `A, B` must have the same size.
+
+The elements of `A` are grouped by their quantile values into `n` bins in total,
+and the bin edges `q` (a range from 0 to 1 with `n+1` elements) are returned.
+The `i`-th bin contains data whose `A`-quantile values are ∈ [`q[i]`, `q[i+1]`).
+The indices of these `A` values are then used to group `B` as well.
+
+`Aq, Bq` are the averages of the binned values of `A, B` in each bin.
+
+The amount of datapoints per quantile is by definition `length(A)/n`.
+"""
+function quantile_space(A, B; n = 50)
+    @assert size(A) == size(B)
+    groups, edges = quantile_decomposition(A, n)
+    Aq, Bq = averages_from_indices(groups, A, B)
+    return Aq, Bq, edges
+end
+
+# Group the linear indices of `A` into `n` quantile bins of equal width over [0, 1].
+# Returns the vector of index groups (one `Vector{Int}` per bin) and the bin edges `qs`.
+function quantile_decomposition(A, n)
+    Avec = vec(A)
+    qs = range(0, 1; length = n+1)
+    bin_width = Base.step(qs)
+    A_cdf = StatsBase.ecdf(Avec)
+    bin_idxs = [Int[] for _ in 1:n]
+    for (i, a) in enumerate(Avec)
+        q = A_cdf(a) # empirical quantile of `a`
+        # The maximum of `A` has quantile exactly 1, which can land one past the
+        # last bin; `min` folds it into the last bin instead of indexing out of bounds.
+        j = min(Int(q÷bin_width) + 1, n) # index of quantile bin
+        push!(bin_idxs[j], i)
+    end
+    return bin_idxs, qs
+end
+
+# For every array in `As`, average its elements over each group of linear indices
+# in `idxs`. Returns one container of means per input array, shaped like `idxs`.
+function averages_from_indices(idxs, As...)
+    return map(As) do A
+        flat = vec(A)
+        map(group -> StatsBase.mean(view(flat, group)), idxs)
+    end
+end
+
+# TODO:
+# function quantile_space(A, B, C; n = 50)
+# I am honestly not sure how to go about that.
+
+###########################################################################################
+# Value space
+###########################################################################################
+"""
+    value_space(A, B; Arange) → Bmeans, bin_indices
+Express the array `B` in the value space of `A`. `A, B` must have the same size.
+
+The values of `A` are binned according to the sorted `Arange`: the `i`-th bin contains
+data whose `A` values are ∈ [`Arange[i]`, `Arange[i+1]`). The same indices that
+binned `A` are then used to bin `B`, and the binned values of `B` are averaged in
+each bin, resulting in `Bmeans`.
+
+Elements of `A` that are not ∈ `Arange` are skipped.
+The returned `bin_indices` are the indices in each bin (hence, the weights
+for the means are just `length.(bin_indices)`).
+
+By default `Arange = range(minimum(A), nextfloat(maximum(A)); length = 100)`.
+"""
+function value_space(A, B; Arange = _default_val_range(A))
+    @assert size(A) == size(B)
+    @assert issorted(Arange)
+    bin_indices = indices_in_values(A, Arange)
+    Bmeans = first(averages_from_indices(bin_indices, B))
+    return Bmeans, bin_indices
+end
+
+# Default bin edges: 100 points from the minimum of `A` up to the float right after
+# its maximum, so that the maximum is strictly below the last edge.
+function _default_val_range(A)
+    lo, hi = extrema(A)
+    return range(lo, nextfloat(hi); length = 100)
+end
+
+# Sort the linear indices of `A` into the half-open bins
+# [`Arange[i]`, `Arange[i+1]`) defined by the sorted edges `Arange`.
+# Returns a vector with one `Vector{Int}` of linear indices per bin.
+function indices_in_values(A, Arange)
+    nbins = length(Arange) - 1
+    bin_idxs = [Int[] for _ in 1:nbins]
+    # `li` are linear indices of `A`. `bi` are bin indices of the values of `A`.
+    for (li, a) in enumerate(vec(A))
+        bi = searchsortedlast(Arange, a)
+        # `bi == 0`: `a` is below the first edge. `bi > nbins`: `a` is at or above
+        # the last edge (or is `NaN`), so it belongs to no half-open bin. Skip both,
+        # which also prevents indexing past the last bin.
+        1 ≤ bi ≤ nbins || continue
+        push!(bin_idxs[bi], li)
+    end
+    return bin_idxs
+end
+
+# 2D version
+"""
+    value_space(A, B, C; Arange, Brange) → Cmeans, bin_indices
+Express the array `C` into the joint value space of `A, B`.
+`A, B, C` must have the same size.
+
+A 2D histogram is created based on the given (sorted) ranges, and the elements of `C`
+are binned according to the values of `A, B`. Then, the elements are averaged, which
+returns a matrix `Cmeans`, defined over the joint space S = `Arange × Brange`.
+`bin_indices` is also a matrix (with vector elements).
+`Cmeans` is `NaN` for bins without any elements.
+
+By default each range is `range(minimum(X), nextfloat(maximum(X)); length = 100)`
+with `X` being `A` or `B` respectively.
+"""
+function value_space(A, B, C;
+        Arange = _default_val_range(A), Brange = _default_val_range(B)
+    )
+    @assert size(A) == size(B) == size(C)
+    @assert issorted(Arange)
+    # `Brange` is searched with `searchsortedlast` just like `Arange`,
+    # so it must be sorted as well.
+    @assert issorted(Brange)
+    bin_idxs_1D = indices_in_values(A, Arange)
+    bin_idxs_2D = refine_indices_in_values(bin_idxs_1D, Arange, B, Brange)
+    Cmeans, = averages_from_indices(bin_idxs_2D, C)
+    return Cmeans, bin_idxs_2D
+end
+
+
+# Split every 1D bin of `A` further according to the values of `B`.
+# `bin_idxs_1D[bi]` holds the linear indices belonging to the `bi`-th bin of `Arange`;
+# each such index is placed in the `bj`-th half-open bin [`Brange[bj]`, `Brange[bj+1]`)
+# that contains its `B` value. Returns a matrix of index vectors of size
+# `(length(Arange)-1, length(Brange)-1)`.
+function refine_indices_in_values(bin_idxs_1D, Arange, B, Brange)
+    vecB = vec(B)
+    nbinsA, nbinsB = length(Arange) - 1, length(Brange) - 1
+    bin_idxs = [Int[] for i in 1:nbinsA, j in 1:nbinsB]
+    for (bi, idxs) in enumerate(bin_idxs_1D) # iterate over bins of A
+        # notice that `idxs` is a container of linear indices for B!
+        for li in idxs
+            bj = searchsortedlast(Brange, vecB[li])
+            # Skip values below the first edge (`bj == 0`) and values at or above the
+            # last edge (or `NaN`), which would otherwise index past the last bin.
+            1 ≤ bj ≤ nbinsB || continue
+            push!(bin_idxs[bi, bj], li)
+        end
+    end
+    return bin_idxs
+end
diff --git a/test/advanced_tests.jl b/test/advanced_tests.jl
@@ -17,3 +17,40 @@
     @test D_back[Hei(1)] < D_back[Hei(2)] < D_back[Hei(3)] < D_back[Hei(4)] < D_back[Hei(5)]
     @test D[Hei(1)] < D_pre[Pre(3)] < D[Hei(11)]
 end
+
+@testset "quantile space" begin
+    # TODO: `quantile_space` has no assertions yet.
+end
+
+@testset "value space" begin
+    t = 0:0.01:5π
+    x = cos.(t)
+    y = sin.(t)
+    rx = range(-1.0, nextfloat(1.0); length = 21)
+    @testset "1D" begin
+        using StatsBase
+        ymeans, bin_indices = value_space(x, y; Arange = rx)
+        weights = length.(bin_indices)
+        @test sum(weights) == length(y)
+        # trigonometric functions have concentrated value weight at the edges
+        @test weights[1] > weights[length(weights)÷2]
+        # sine must be very small when cosine is large (both in magnitude):
+        # compare the mean sine of the edge bin with that of a central bin
+        @test abs(ymeans[1]) < abs(ymeans[length(ymeans)÷2])
+        @test mean(ymeans, Weights(weights)) ≈ StatsBase.mean(y)
+    end
+    @testset "2D" begin
+        z = y
+        zmeans, bin_indices = value_space(x, y, z; Arange = rx, Brange = rx)
+        # Because cosine and sine can never be ±1 at the same time, all corners
+        # of z must be NaN:
+        L = length(rx)-1
+        @test all(isnan, [zmeans[1,1], zmeans[1,L], zmeans[L,1], zmeans[L,L]])
+        # and again because of where cosines and sines may have values at the same time
+        # zmeans only is non NaN at specific locations one can derive (depends on step size)
+        non_nan_i = 6:15
+        @test all(!isnan, zmeans[1, non_nan_i])
+        @test all(!isnan, zmeans[non_nan_i, end])
+        # and because z is y, it must increase along the second dimension
+        @test issorted(zmeans[1, non_nan_i])
+    end
+end