Echinocoder/C0HomDeg1_dotting_encoder_for_array_of_reals_as_multiset.py at main · kesterlester/Echinocoder · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import numpy as np
from tools import sort_each_np_array_row
#import hashlib
from MultisetEncoder import MultisetEncoder
from typing import Any


class Encoder(MultisetEncoder):
    """
    This encoder encodes via sorted length-n lists of dot products.
    The first k such lists are with the standard coordinate axes.
    After that come "extra_dots" extra dot products. extra_dots is set in the constructor.

    The directions dotted against are the rows of self.matrix, which has
    shape (k + extra_dots, k): a k x k identity matrix stacked on top of
    extra_dots rows of standard-normal samples. The samples are drawn from a
    PCG64 bit generator seeded with the constant 0.
    """

    def __init__(self, k: int, extra_dots: int):
        """
        Build the projection matrix for vectors of dimension k.

        Args:
            k: dimension of each input vector (number of columns of the data passed to encode).
            extra_dots: number of additional random directions used on top of the k coordinate axes.

        Raises:
            ValueError: if k or extra_dots is negative.
        """
        super().__init__()
        if k < 0 or extra_dots < 0:
            # It makes no sense to claim we will code vectors or sets for k<0 or extra_dots<0
            raise ValueError(
                f"k and extra_dots must both be non-negative, but got k={k} and extra_dots={extra_dots}"
            )

        self.k = k
        self.extra_dots = extra_dots
        self.total_dots = k + extra_dots

        # Fixed seed: the extra directions are the same every time an encoder is constructed.
        rng = np.random.Generator(np.random.PCG64(0))

        # Technically this matrix should then be checked -- to see that none of its kxk minors
        # have zero determinant. But for now we "assume" that that is true.
        self.matrix = np.concatenate(
            (np.identity(k), rng.standard_normal((self.extra_dots, self.k))),
            axis=0,
        )

    def encode(self, data: np.ndarray, debug: bool = False) -> "tuple[np.ndarray, tuple[int, int], Any]":
        """
        Encode the rows of data as a flat array of dot products.

        Args:
            data: 2-dimensional array of shape (n, k); each row is one k-vector.
            debug: when true, print the input, the matrices being multiplied and the result.

        Returns:
            A 3-tuple (encoding, (n, k), None). encoding is a 1-dimensional array
            whose length equals self.size_from_n_k(n, k); it is empty when n or k is zero.

        Raises:
            ValueError: if data is not 2-dimensional, or if n and k are both
                non-zero and k differs from the k this encoder was built for.
        """
        if debug:
            print(f"data is {data}")

        shape = data.shape
        if len(shape) != 2:
            # Same exception type as the tuple-unpacking failure this replaces, but with a usable message.
            raise ValueError(f"data must be a 2-dimensional (n, k) array, but has shape {shape}")
        n, k = shape

        # Catch a few special cases:
        if n == 0 or k == 0:
            encoding = np.array([], dtype=np.float64)
            assert len(encoding) == self.size_from_n_k(n, k)
            return encoding, (n, k), None

        if k != self.k:
            assert self.size_from_n_k(n, k) == -1
            raise ValueError(f"This encoder is setup for k={self.k} so does not like data having k={k}")

        if debug:
            print(f"About to muliply {self.matrix} by {data.T}")

        ##### HERE IS THE ACTUAL ENCODING:     ###########
        # self.matrix @ data.T has shape (k + extra_dots, n): one row per direction,
        # one column per input vector.
        # NOTE(review): sort_each_np_array_row comes from tools; presumably it sorts
        # every row independently -- confirm against that module.
        encoding = sort_each_np_array_row(self.matrix @ data.T).flatten()
        ##### THE ACTUAL ENCODING IS COMPLETE! ###########

        if debug:
            print(f"Encoding is {encoding} with length {len(encoding)} which should be {self.size_from_n_k(n,k)}")
        assert len(encoding) == self.size_from_n_k(n, k)
        return encoding, (n, k), None

    def decode(self, encoding, encoding_n_and_k):
        """Decoding is not implemented for this encoder; this always returns None."""
        return None

    def size_from_n_k(self, n: int, k: int) -> int:
        """
        Return the length of the encoding produced for n vectors of dimension k.

        Returns 0 when n or k is zero, -1 when k is non-zero but is not the k
        this encoder was built for, and n * (k + extra_dots) otherwise.
        """
        if k == 0 or n == 0:
            return 0
        if k != self.k:
            return -1
        return n * (k + self.extra_dots)


## def hash_to_128_bit_md5_int(md5):
##     return int.from_bytes(md5.digest(), 'big') # 128 bits worth.
##
## def hash_to_64_bit_reals_in_unit_interval(md5):
##     """
##     An md5 sum is 128 bits long, so we get two such reals (one from each 64-bit half).
##     N.B. This hash is of self._eji_counts only -- i.e. it ignores self._index.
##     For the purposes for which this hash will be used, that is believed to be appropriate.
##     """
##
##     x = hash_to_128_bit_md5_int(md5)
##     bot_64_bits = x & 0xffFFffFFffFFffFF
##     top_64_bits = x >> 64
##     return np.float64(top_64_bits)/(1 << 64), np.float64(bot_64_bits)/(1 << 64)


def tost(): # Renamed from test -> tost to avoid pycharm mis-detecting / mis-running unit tests!
    """Build a small encoder, show its matrix, and check the encoding size it reports."""
    n, k, extra = 5, 2, 6
    enc = Encoder(k=k, extra_dots=extra)
    print("Encoder matrix is\n", enc.matrix)
    assert enc.size_from_n_k(n, k) == n * (k + extra)
    # Placeholder for a future array-comparison check:
    #calculated = np.array([2, 3, 4, 1, 0])
    #expected = np.array([2, 3, 4, 1, 0])
    #np.testing.assert_array_equal(calculated, expected)


def run_unit_tests():
    """Run every self-check defined in this module, in order."""
    # "tost" rather than "test": renamed to avoid pycharm mis-detecting / mis-running unit tests!
    for check in (tost,):
        check()

def main():
    """Run the self-checks, then encode a small example with debug output and print the result."""
    run_unit_tests()

    # Four 2-vectors, one per row.
    good_input = np.asarray([[4,2],[-3,5],[8,9],[2,7]])
    output = Encoder(k=2, extra_dots=10).encode(good_input, debug=True)

    report = ("Encoding:", f"{good_input}", "leads to:", f"{output}")
    for line in report:
        print(line)

# Run the self-checks and the demonstration only when this file is executed
# as a script, not when it is imported as a module.
if __name__ == "__main__":
    main()