AxisArrays.jl/src/categoricalvector.jl at e2796cbcaa92cc884b94d0baa7806e1d67118016 · invenia/AxisArrays.jl · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85

export CategoricalVector

"""
A CategoricalVector is an AbstractVector which is treated as a categorical axis regardless
of the element type. Duplicate values are not allowed but are not filtered out.

A CategoricalVector axis can be indexed with a ClosedInterval, with a value, or with a
vector of values. Use of a CategoricalVector{Tuple} axis allows indexing similar to the
hierarchical index of the Python Pandas package or the R data.table package.

In general, indexing into a CategoricalVector will be much slower than the corresponding
SortedVector or another sorted axis type, as linear search is required.

### Constructors

```julia
CategoricalVector(x::AbstractVector)
```

### Arguments

* `x::AbstractVector` : the wrapped vector

### Examples

```julia
v = CategoricalVector(collect([1; 8; 10:15]))
A = AxisArray(reshape(1:16, 8, 2), v, [:a, :b])
A[Axis{:row}(1), :]
A[Axis{:row}(10), :]
A[Axis{:row}([1, 10]), :]

## Hierarchical index example with three key levels

data = reshape(1.:40., 20, 2)
v = collect(zip([:a, :b, :c][rand(1:3,20)], [:x,:y][rand(1:2,20)], [:x,:y][rand(1:2,20)]))
A = AxisArray(data, CategoricalVector(v), [:a, :b])
A[:b, :]
A[[:a,:c], :]
A[(:a,:x), :]
A[(:a,:x,:x), :]
```
"""
struct CategoricalVector{T, A<:AbstractVector{T}} <: AbstractVector{T}
    # The wrapped vector. It is stored as given: nothing in this type sorts it or
    # removes duplicate entries.
    data::A
end

# Outer constructor: wrap `data`, taking the element type `T` from the vector and the
# storage parameter from its concrete type.
CategoricalVector(data::AbstractVector{T}) where {T} =
    CategoricalVector{T, typeof(data)}(data)

# Scalar indexing: forward the integer position to the wrapped vector and return the
# element stored there.
function Base.getindex(cv::CategoricalVector, idx::Int)
    return cv.data[idx]
end

# Vector indexing: index the wrapped vector with `idx` and wrap the selection in a new
# CategoricalVector.
function Base.getindex(cv::CategoricalVector, idx::AbstractVector)
    selection = cv.data[idx]
    return CategoricalVector(selection)
end

# Shape queries are all delegated to the wrapped vector.

# Number of elements in the wrapped vector.
function Base.length(cv::CategoricalVector)
    return length(cv.data)
end

# Size tuple of the wrapped vector.
function Base.size(cv::CategoricalVector)
    return size(cv.data)
end

# Size of the wrapped vector along dimension `i`.
function Base.size(cv::CategoricalVector, i)
    return size(cv.data, i)
end

# Index ranges of the wrapped vector.
function Base.indices(cv::CategoricalVector)
    return indices(cv.data)
end

# A CategoricalVector axis always reports the `Categorical` trait, whatever its element
# type `T` or storage type `A`.
function axistrait(::Type{CategoricalVector{T,A}}) where {T,A}
    return Categorical
end

# No validation is performed on a CategoricalVector axis; this always returns `nothing`.
function checkaxis(::CategoricalVector)
    return nothing
end


## Add some special indexing for CategoricalVector{Tuple}s to achieve something like
## pandas' hierarchical indexing

# Fallback for a tuple-valued CategoricalVector axis: an index that is not itself a
# tuple is wrapped in a 1-tuple and handed to the tuple method, so a bare value is
# treated as a key for the first level only.
function axisindexes(ax::Axis{S,CategoricalVector{T,A}}, idx) where {T<:Tuple,S,A}
    return axisindexes(ax, (idx,))
end

# Tuple lookup on a tuple-valued CategoricalVector axis: return every position of
# `ax.val` whose element matches `idx` according to `_tuple_matches`. Every position is
# tested, so this is a linear scan over the axis.
function axisindexes(ax::Axis{S,CategoricalVector{T,A}}, idx::Tuple) where {T<:Tuple,S,A}
    return collect(
        filter(ax_idx -> _tuple_matches(ax.val[ax_idx], idx), indices(ax.val)...)
    )
end

# Prefix test used for hierarchical lookup: returns `true` when `idx` is no longer than
# `element` and each entry of `idx` is `==` to the entry of `element` at the same
# position. An empty `idx` therefore matches every element.
function _tuple_matches(element::Tuple, idx::Tuple)
    nkeys = length(idx)
    if nkeys > length(element)
        return false
    end

    # Only the first `nkeys` levels of `element` take part in the comparison.
    for level in 1:nkeys
        if !(element[level] == idx[level])
            return false
        end
    end

    return true
end

# Multi-key lookup on a tuple-valued CategoricalVector axis: each entry of `idx` is
# resolved through `axisindexes(ax, i)` and the resulting positions are concatenated in
# the order the keys were given.
function axisindexes(
    ax::Axis{S,CategoricalVector{T,A}}, idx::AbstractArray
) where {T<:Tuple,S,A}
    # Accumulate into a typed vector instead of `vcat(parts...)`: an empty `idx` then
    # yields `Int[]` rather than the untyped `Any[]` a zero-argument `vcat` returns,
    # and the list of per-key results is never splatted into varargs.
    matches = Int[]
    for i in idx
        append!(matches, axisindexes(ax, i))
    end
    return matches
end