Skip to content
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions REQUIRE
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
julia 0.6
IntervalSets 0.1
IterTools
RangeArrays
5 changes: 4 additions & 1 deletion src/AxisArrays.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@ __precompile__()
module AxisArrays

using Base: tail
import Base.Iterators: repeated
using RangeArrays, IntervalSets
using IterTools

export AxisArray, Axis, axisnames, axisvalues, axisdim, axes, atindex, atvalue
export AxisArray, Axis, axisnames, axisvalues, axisdim, axes, atindex, atvalue, collapse

# From IntervalSets:
export ClosedInterval, ..
Expand All @@ -15,6 +17,7 @@ include("intervals.jl")
include("search.jl")
include("indexing.jl")
include("sortedvector.jl")
include("categoricalvector.jl")
include("combine.jl")

end
85 changes: 85 additions & 0 deletions src/categoricalvector.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@

export CategoricalVector
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please use a different name for this, as CategoricalVector is already used in CategoricalArrays, which replaced PooledDataArray in DataTables (and soon in DataFrames). Why not CategoricalAxis?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not an Axis, and it mirrors the SortedVector type.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, IIUC its only purpose is to treat it as a categorical axis, isn't it? Ideas about other possible names? It would be too bad to have conflicts when loading both AxisArrays and DataTables/DataFrames.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could choose not to export it?

There aren't conflicts when you use both packages unless you also use CategoricalVector.

The nomenclature used within AxisArrays is Categorical, which is how CategoricalVector came up.

How about DiscreteVector?


"""
A CategoricalVector is an AbstractVector which is treated as a categorical axis regardless
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing backticks around types in this docstring.

of the element type. Duplicate values are not allowed but are not filtered out.

A CategoricalVector axis can be indexed with a ClosedInterval, with a value, or with a
vector of values. Use of a CategoricalVector{Tuple} axis allows indexing similar to the
hierarchical index of the Python Pandas package or the R data.table package.

In general, indexing into a CategoricalVector will be much slower than the corresponding
SortedVector or another sorted axis type, as linear search is required.

### Constructors

```julia
CategoricalVector(x::AbstractVector)
```

### Arguments

* `x::AbstractVector` : the wrapped vector

### Examples

```julia
v = CategoricalVector(collect([1; 8; 10:15]))
A = AxisArray(reshape(1:16, 8, 2), v, [:a, :b])
A[Axis{:row}(1), :]
A[Axis{:row}(10), :]
A[Axis{:row}([1, 10]), :]

## Hierarchical index example with three key levels

data = reshape(1.:40., 20, 2)
v = collect(zip([:a, :b, :c][rand(1:3,20)], [:x,:y][rand(1:2,20)], [:x,:y][rand(1:2,20)]))
A = AxisArray(data, CategoricalVector(v), [:a, :b])
A[:b, :]
A[[:a,:c], :]
A[(:a,:x), :]
A[(:a,:x,:x), :]
```
"""
struct CategoricalVector{T, A<:AbstractVector{T}} <: AbstractVector{T}
    # The wrapped vector. `T` is its element type and `A` its concrete container type.
    data::A
end

# Outer constructor: take the element type `T` from the wrapped vector and record the
# vector's concrete type as the second type parameter.
CategoricalVector(data::AbstractVector{T}) where {T} =
    CategoricalVector{T, typeof(data)}(data)

# Scalar lookup returns the stored element itself.
function Base.getindex(v::CategoricalVector, idx::Int)
    return v.data[idx]
end

# Indexing with a vector of positions re-wraps the selection, so the result is again a
# `CategoricalVector`.
function Base.getindex(v::CategoricalVector, idx::AbstractVector)
    selected = v.data[idx]
    return CategoricalVector(selected)
end

# Shape queries are all answered by the wrapped vector.
function Base.length(v::CategoricalVector)
    return length(v.data)
end

function Base.size(v::CategoricalVector)
    return size(v.data)
end

function Base.size(v::CategoricalVector, i)
    return size(v.data, i)
end

function Base.indices(v::CategoricalVector)
    return indices(v.data)
end

# A fully parameterized `CategoricalVector` type reports the `Categorical` trait,
# whatever its element type is.
function axistrait(::Type{CategoricalVector{T,A}}) where {T,A}
    return Categorical
end

# No validation is performed on a `CategoricalVector` axis.
function checkaxis(::CategoricalVector)
    return nothing
end


## Add some special indexing for CategoricalVector{Tuple}'s to achieve something like
## pandas' hierarchical indexing

# A non-tuple key on a tuple-valued categorical axis is wrapped into a one-element tuple
# and looked up through the `idx::Tuple` method.
function axisindexes(ax::Axis{S,CategoricalVector{T,A}}, idx) where {T<:Tuple,S,A}
    return axisindexes(ax, (idx,))
end

# Tuple key lookup: return the positions along the axis whose stored tuple matches `idx`
# according to `_tuple_matches`. This is a linear scan over the axis.
function axisindexes(ax::Axis{S,CategoricalVector{T,A}}, idx::Tuple) where {T<:Tuple,S,A}
    matches(position) = _tuple_matches(ax.val[position], idx)

    # `indices(ax.val)` is a one-element tuple for a vector; splatting it hands the
    # single index range to `filter`.
    return collect(filter(matches, indices(ax.val)...))
end

# Return `true` when `idx` is a prefix of `element`: `idx` may be shorter than
# `element`, but every entry it does provide must compare equal (`==`) to the entry at
# the same position. An `idx` longer than `element` never matches.
function _tuple_matches(element::Tuple, idx::Tuple)
    nkeys = length(idx)
    if nkeys > length(element)
        return false
    end

    for k in 1:nkeys
        if !(element[k] == idx[k])
            return false
        end
    end

    return true
end

# Index with several keys at once: look each key up in turn and concatenate the matching
# positions in key order.
function axisindexes(
    ax::Axis{S,CategoricalVector{T,A}}, idx::AbstractArray
) where {T<:Tuple,S,A}
    # Accumulate into a concretely typed vector instead of splatting into `vcat`, so an
    # empty `idx` gives an empty `Vector{Int}` rather than the `Any[]` that `vcat()`
    # returns, and long key lists are not splatted into a varargs call.
    positions = Int[]
    for key in idx
        append!(positions, axisindexes(ax, key))
    end

    return positions
end
195 changes: 195 additions & 0 deletions src/combine.jl
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,198 @@ function Base.join{T,N,D,Ax}(As::AxisArray{T,N,D,Ax}...; fillvalue::T=zero(T),
return result

end #join

# Build a lazy sequence of flat-axis keys for one array: each key is `array_name`
# followed by one combination of values drawn from the given axes (via `product`).
function _flatten_array_axes(array_name, array_axes...)
    axis_values = (ax.val for ax in array_axes)

    # Normalise each combination to a tuple so it can be splatted after the name.
    astuple(combo) = combo isa Tuple ? combo : (combo,)

    return ((array_name, astuple(combo)...) for combo in product(axis_values...))
end

# Concatenate the flat-axis keys of every array, in array order, into one collection.
# `array_names` and `array_axes` are walked in lockstep: the i-th name labels the keys
# generated from the i-th group of axes.
function _flatten_axes(array_names, array_axes)
    keys_per_array = map(array_names, array_axes) do name, axes_of_array
        _flatten_array_axes(name, axes_of_array...)
    end

    return collect(Iterators.flatten(keys_per_array))
end

# Apply `Base.IteratorsMD.split(A, Val{N})` to every argument and return the results as
# a tuple, one entry per argument.
function _splitall(::Type{Val{N}}, As...) where {N}
    return tuple((Base.IteratorsMD.split(A, Val{N}) for A in As)...)
end

# Apply `reshape(A, Val{N})` to every argument and return the results as a tuple, one
# entry per argument.
function _reshapeall(::Type{Val{N}}, As...) where {N}
    return tuple((reshape(A, Val{N}) for A in As)...)
end

# Verify that every axis in `common_axis_tuple` carries the same name as the first one.
# Throws an `ArgumentError` on a mismatch; returns `nothing` otherwise.
function _check_common_axes(common_axis_tuple)
    reference_name = axisname(first(common_axis_tuple))
    names_agree = all(reference_name .=== axisname.(common_axis_tuple[2:end]))

    names_agree ||
        throw(ArgumentError("Leading common axes must have the same name in each array"))

    return nothing
end

# Compute the element type of the flattened axis. For each array the key type is a tuple
# of the label type followed by the element types of that array's trailing axes; the
# result is the `typejoin` of those per-array key types.
function _flat_axis_eltype(LType, trailing_axes)
    key_type(array_trailing_axes) = Tuple{LType, eltype.(array_trailing_axes)...}

    return typejoin(map(key_type, trailing_axes)...)
end

# Convenience method: when no labels are given, the arrays are labelled by their
# position, i.e. `(1, 2, ..., AN)`.
function collapse(::Type{Val{N}}, As::Vararg{AxisArray, AN}) where {N, AN}
    return collapse(Val{N}, ntuple(identity, Val{AN}), As...)
end

# Convenience method: an explicit result array type but no labels. The arrays are
# labelled by their position, i.e. `(1, 2, ..., AN)`.
function collapse(
    ::Type{Val{N}}, ::Type{NewArrayType}, As::Vararg{AxisArray, AN}
) where {N, AN, NewArrayType<:AbstractArray}
    return collapse(Val{N}, NewArrayType, ntuple(identity, Val{AN}), As...)
end

# Convenience method: labels given but no result array type. The result is backed by a
# plain `Array` whose element type is the promoted element type of the inputs and whose
# dimensionality is `N + 1` (the `N` common axes plus the flattened axis).
@generated function collapse(
    ::Type{Val{N}}, labels::NTuple{AN, LType}, As::Vararg{AxisArray, AN}
) where {N, AN, LType}
    # Inside the generator `As` holds the argument *types*, so both values below are
    # computed once per method specialization and spliced into the returned call.
    flat_dim_int = Int(N) + 1
    new_eltype = Base.promote_eltype(As...)

    quote
        collapse(Val{N}, Array{$new_eltype, $flat_dim_int}, labels, As...)
    end
end

"""
collapse(::Type{Val{N}}, As::AxisArray...) -> AxisArray
collapse(::Type{Val{N}}, labels::Tuple, As::AxisArray...) -> AxisArray
collapse(::Type{Val{N}}, ::Type{NewArrayType}, As::AxisArray...) -> AxisArray
collapse(::Type{Val{N}}, ::Type{NewArrayType}, labels::Tuple, As::AxisArray...) -> AxisArray

Collapses `AxisArray`s with `N` equal leading axes into a single `AxisArray`.
All additional axes in any of the arrays are flattened into a single additional
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

collapsed

`CategoricalVector{Tuple}` axis.
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Comment on the label of the new axis?


### Arguments

* `::Type{Val{N}}`: the greatest common dimension to share between all input
arrays. The remaining axes are flattened. All `N` axes must be common
to each input array, at the same dimension. Values from `0` up to the
minimum number of dimensions across all input arrays are allowed.
* `labels::Tuple`: (optional) a label for each `AxisArray` in `As` which is used in the flat
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not entirely clear. Maybe "Index assigned to each array in As in the :collapsed axis"? Since it's optional, also specify what happens by default.

axis.
* `::Type{NewArrayType<:AbstractArray{_, N+1}}`: (optional) the desired underlying array
type for the returned `AxisArray`.
* `As::AxisArray...`: `AxisArray`s to be collapsed together.

### Examples

```
julia> price_data = AxisArray(rand(10), Axis{:time}(Date(2016,01,01):Day(1):Date(2016,01,10)))
1-dimensional AxisArray{Float64,1,...} with axes:
:time, 2016-01-01:1 day:2016-01-10
And data, a 10-element Array{Float64,1}:
0.885014
0.418562
0.609344
0.72221
0.43656
0.840304
0.455337
0.65954
0.393801
0.260207

julia> size_data = AxisArray(rand(10,2), Axis{:time}(Date(2016,01,01):Day(1):Date(2016,01,10)), Axis{:measure}([:area, :volume]))
2-dimensional AxisArray{Float64,2,...} with axes:
:time, 2016-01-01:1 day:2016-01-10
:measure, Symbol[:area, :volume]
And data, a 10×2 Array{Float64,2}:
0.159434 0.456992
0.344521 0.374623
0.522077 0.313256
0.994697 0.320953
0.95104 0.900526
0.921854 0.729311
0.000922581 0.148822
0.449128 0.761714
0.650277 0.135061
0.688773 0.513845

julia> collapsed = collapse(Val{1}, (:price, :size), price_data, size_data)
2-dimensional AxisArray{Float64,2,...} with axes:
:time, 2016-01-01:1 day:2016-01-10
:flat, Tuple{Symbol,Vararg{Symbol,N} where N}[(:price,), (:size, :area), (:size, :volume)]
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will need to be fixed if you change the name to :collapsed

And data, a 10×3 Array{Float64,2}:
0.885014 0.159434 0.456992
0.418562 0.344521 0.374623
0.609344 0.522077 0.313256
0.72221 0.994697 0.320953
0.43656 0.95104 0.900526
0.840304 0.921854 0.729311
0.455337 0.000922581 0.148822
0.65954 0.449128 0.761714
0.393801 0.650277 0.135061
0.260207 0.688773 0.513845

julia> collapsed[Axis{:flat}(:size)] == size_data
true
```

"""
@generated function collapse(::Type{Val{N}},
                             ::Type{NewArrayType},
                             labels::NTuple{AN, LType},
                             As::Vararg{AxisArray, AN}) where {N, AN, LType, NewArrayType<:AbstractArray}
    # ---- Generator stage -------------------------------------------------------------
    # Only argument *types* are visible here: `As` is a tuple of `AxisArray` types.
    # Everything that depends on types alone (argument validation, axis-name checks,
    # the result's axis and element types) is worked out now and spliced into the
    # returned expression.
    if N < 0
        throw(ArgumentError("collapse dimension N must be at least 0"))
    end

    # `N` leading axes must exist in every input, so it is bounded by the input with
    # the fewest dimensions.
    if N > minimum(ndims.(As))
        throw(ArgumentError(
            """
            collapse dimension N must not be greater than the minimum number of dimensions
            across all input arrays
            """
        ))
    end

    flat_dim = Val{N + 1}
    flat_dim_int = Int(N) + 1

    # Split each array's axis types into the first `N` (common) and the rest (trailing),
    # then regroup so that `common_axes[i]` / `trailing_axes[i]` belong to array `i`.
    common_axes, trailing_axes = zip(_splitall(Val{N}, axisparams.(As)...)...)

    # Zipping across arrays groups the axes found at the same dimension; their names
    # must agree.
    foreach(_check_common_axes, zip(common_axes...))

    new_common_axes = first(common_axes)
    flat_axis_eltype = _flat_axis_eltype(LType, trailing_axes)
    flat_axis_type = CategoricalVector{flat_axis_eltype, Vector{flat_axis_eltype}}

    # NOTE(review): a reviewer suggested naming the new axis `:collapsed`. It stays
    # `:flat` here so the public behaviour shown in the docstring example is unchanged.
    new_axes_type = Tuple{new_common_axes..., Axis{:flat, flat_axis_type}}
    new_eltype = Base.promote_eltype(As...)

    quote
        # ---- Run time ----------------------------------------------------------------
        # Same split as above, but on the actual axis objects.
        common_axes, trailing_axes = zip(_splitall(Val{N}, axes.(As)...)...)

        # Axis names were checked at generation time; axis *values* can only be
        # compared now.
        for common_axis_tuple in zip(common_axes...)
            if !isempty(common_axis_tuple)
                for common_axis in common_axis_tuple[2:end]
                    if !all(axisvalues(common_axis) .== axisvalues(common_axis_tuple[1]))
                        throw(ArgumentError(
                            """
                            Leading common axes must be identical across
                            all input arrays"""
                        ))
                    end
                end
            end
        end

        # Reshape every input to `N + 1` dimensions and concatenate along the last one.
        array_data = cat($flat_dim, _reshapeall($flat_dim, As...)...)

        axis_array_type = AxisArray{
            $new_eltype,
            $flat_dim_int,
            $NewArrayType,
            $new_axes_type
        }

        # The common axes are taken from the first array; the flat axis holds one key
        # per trailing-axis combination of every input, prefixed by that input's label.
        new_axes = (
            first(common_axes)...,
            Axis{:flat, $flat_axis_type}($flat_axis_type(_flatten_axes(labels, trailing_axes))),
        )

        return axis_array_type(array_data, new_axes)
    end
end
9 changes: 9 additions & 0 deletions src/core.jl
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,15 @@ end
# Fallbacks for arbitrary arrays: report the axes produced by `default_axes`, either all
# of them or only the one at dimension `dim`.
function axes(A::AbstractArray)
    return default_axes(A)
end

function axes(A::AbstractArray, dim::Int)
    all_axes = default_axes(A)
    return all_axes[dim]
end

"""
    axisparams(::AxisArray) -> Tuple
    axisparams(::Type{AxisArray{T,N,D,Ax}}) -> Tuple

Return the parameters of the axes tuple type `Ax` of an `AxisArray` (given either an
instance or its fully parameterized type) as a tuple.
"""
axisparams(::AxisArray{T,N,D,Ax}) where {T,N,D,Ax} = (Ax.parameters...,)
axisparams(::Type{AxisArray{T,N,D,Ax}}) where {T,N,D,Ax} = (Ax.parameters...,)

### Axis traits ###
# Root of the axis-trait type hierarchy.
abstract type AxisTrait end
# Field-less trait type, a direct subtype of `AxisTrait`.
struct Dimensional <: AxisTrait end
Expand Down
17 changes: 16 additions & 1 deletion src/indexing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,17 @@ end
ex = Expr(:tuple)
n = 0
for i=1:length(I)
if axistrait(I[i]) <: Categorical && i <= length(Ax.parameters)
if I[i] <: Axis
push!(ex.args, :(axisindexes(A.axes[$i], I[$i].val)))
else
push!(ex.args, :(axisindexes(A.axes[$i], I[$i])))
end
n += 1

continue
end

if I[i] <: Idx
push!(ex.args, :(I[$i]))
n += 1
Expand All @@ -243,7 +254,11 @@ end
end
n += length(I[i])
elseif i <= length(Ax.parameters)
push!(ex.args, :(axisindexes(A.axes[$i], I[$i])))
if I[i] <: Axis
push!(ex.args, :(axisindexes(A.axes[$i], I[$i].val)))
else
push!(ex.args, :(axisindexes(A.axes[$i], I[$i])))
end
n += 1
else
push!(ex.args, :(error("dimension ", $i, " does not have an axis to index")))
Expand Down
21 changes: 21 additions & 0 deletions test/categoricalvector.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Test CategoricalVector with a hierarchical index (indexed using Tuples)
# The keys are drawn at random, so the RNG is seeded first: the hard-coded row ranges
# asserted against below depend on the exact keys this seed produces.
srand(1234)
data = reshape(1.:40., 20, 2)
# Twenty three-level keys: first level from (:a, :b, :c), second and third from (:x, :y).
v = collect(zip([:a, :b, :c][rand(1:3,20)], [:x,:y][rand(1:2,20)], [:x,:y][rand(1:2,20)]))
# Reorder rows and keys by the sorted keys, so that rows matching a key prefix can be
# written as a contiguous row range.
idx = sortperm(v)
A = AxisArray(data[idx,:], CategoricalVector(v[idx]), [:a, :b])
# A bare first-level key selects every row whose key tuple starts with it.
@test A[:b, :] == A[5:12, :]
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I assume these reflect the actual random numbers produced due to srand(1234)? Would it be clearer to just hardcode v?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would; I was just following test/sortedvector.jl as much as possible. I can change it though :)

# A vector of first-level keys selects the rows matching any of them.
@test A[[:a,:c], :] == A[[1:4;13:end], :]
# Partial (two-level) and full (three-level) tuple keys.
@test A[(:a,:y), :] == A[2:4, :]
@test A[(:c,:y,:y), :] == A[16:end, :]
@test AxisArrays.axistrait(axes(A)[1]) <: AxisArrays.Categorical

# Non-tuple element type: a CategoricalVector of integers.
v = CategoricalVector(collect([1; 8; 10:15]))
# NOTE(review): this assertion runs before `A` is rebuilt on the next line, so it
# re-checks the tuple-keyed array from above rather than one built from the new `v`.
@test AxisArrays.axistrait(axes(A)[1]) <: AxisArrays.Categorical
A = AxisArray(reshape(1:16, 8, 2), v, [:a, :b])
# Indexing with an `Axis` that wraps a CategoricalVector of values.
@test A[Axis{:row}(CategoricalVector([15]))] == AxisArray(reshape(A.data[8, :], 1, 2), CategoricalVector([15]), [:a, :b])
@test A[Axis{:row}(CategoricalVector([15])), 1] == AxisArray([A.data[8, 1]], CategoricalVector([15]))
@test AxisArrays.axistrait(axes(A)[1]) <: AxisArrays.Categorical

# TODO: maybe make this work? Would require removing or modifying Base.getindex(A::AxisArray, idxs::Idx...)
# @test A[CategoricalVector([15]), 1] == AxisArray([A.data[8, 1]], CategoricalVector([15]))
Loading