v/vlib/compress/gzip/gzip.v at master · vlang/v · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
// [rfc1952](https://datatracker.ietf.org/doc/html/rfc1952) compliant
// gzip compression/decompression

module gzip

import compress as compr
import hash.crc32

// CompressFlags
// TODO: These flags have no use now
@[flag]
pub enum CompressFlags {
	// The low 12 bits will be overwritten by `compression_level`
	compression_level_overwrite_flag01
	compression_level_overwrite_flag02
	compression_level_overwrite_flag03
	compression_level_overwrite_flag04
	compression_level_overwrite_flag05
	compression_level_overwrite_flag06
	compression_level_overwrite_flag07
	compression_level_overwrite_flag08
	compression_level_overwrite_flag09
	compression_level_overwrite_flag10
	compression_level_overwrite_flag11
	compression_level_overwrite_flag12

	// If set, the compressor outputs a zlib header before the deflate data, and the Adler-32 of the source data at the end. Otherwise, you'll get raw deflate data.
	write_zlib_header //= 0x01000
	// Always compute the adler-32 of the input data (even when not writing zlib headers).
	compute_adler32 //= 0x02000
	// Set to use faster greedy parsing, instead of more efficient lazy parsing.
	greedy_parsing_flag //= 0x04000
	// Enable to decrease the compressor's initialization time to the minimum, but the output may vary from run to run given the same input (depending on the contents of memory).
	nondeterministic_parsing_flag //= 0x08000
	// Only look for RLE matches (matches with a distance of 1)
	rle_matches //= 0x10000
	// Discards matches <= 5 chars if enabled.
	filter_matches //= 0x20000
	// Disable usage of optimized Huffman tables.
	force_all_static_blocks //= 0x40000
	// Only use raw (uncompressed) deflate blocks.
	force_all_raw_blocks //= 0x80000
}

// CompressParams set compression_level for compression:
// 0: Huffman only;
// 1: Huffman+LZ (fastest/crap compression);
// 128: default_max_probes;
// 4095: Huffman+LZ (slowest/best compression)
@[params]
pub struct CompressParams {
pub:
	compression_level int = 128 // 0~4095
	flags             CompressFlags
}

// compresses an array of bytes using gzip and returns the compressed bytes in a new array
// Example: b := 'abcde'.repeat(1000).bytes(); cmprsd := gzip.compress(b, compression_level:4095)!; assert cmprsd.len == 47
// Note: compression_level 0~4095
pub fn compress(data []u8, params CompressParams) ![]u8 {
	if params.compression_level !in 0..4096 {
		return error('compression level should in [0,4095]')
	}
	// The low 12 bits are reserved to control the max # of hash probes per dictionary lookup.
	flags := params.compression_level | (int(params.flags) & ~int(4095))
	compressed := compr.compress(data, flags)!
	// header
	mut result := [
		u8(0x1f), // magic numbers (1F 8B)
		0x8b,
		0x08, // deflate
		0x00, // header flags
		0x00, // 4-byte timestamp, 0 = no timestamp (00 00 00 00)
		0x00,
		0x00,
		0x00,
		0x00, // extra flags
		0xff, // operating system id (0xff = unknown)
	] // 10 bytes
	result << compressed
	// trailer
	checksum := crc32.sum(data)
	length := data.len
	result << [
		u8(checksum),
		u8(checksum >> 8),
		u8(checksum >> 16),
		u8(checksum >> 24),
		u8(length),
		u8(length >> 8),
		u8(length >> 16),
		u8(length >> 24),
	] // 8 bytes
	return result
}

// DecompressFlags
// TODO: These flags have no use now
@[flag]
pub enum DecompressFlags {
	// If set, the input has a valid zlib header and ends with an adler32 checksum (it's a valid zlib stream). Otherwise, the input is a raw deflate stream.
	parse_zlib_header
	// If set, there are more input bytes available beyond the end of the supplied input buffer. If clear, the input buffer contains all remaining input.
	has_more_input
	// If set, the output buffer is large enough to hold the entire decompressed stream. If clear, the output buffer is at least the size of the dictionary (typically 32KB).
	using_non_wrapping_output_buf
	// Force adler-32 checksum computation of the decompressed bytes.
	compute_adler32
}

// DecompressParams set flags for decompression:
@[params]
pub struct DecompressParams {
pub:
	verify_header_checksum bool = true
	verify_length          bool = true
	verify_checksum        bool = true
	flags                  DecompressFlags
}

pub const reserved_bits = 0b1110_0000
pub const ftext = 0b0000_0001
pub const fextra = 0b0000_0100
pub const fname = 0b0000_1000
pub const fcomment = 0b0001_0000
pub const fhcrc = 0b0000_0010

const min_header_length = 18

@[noinit]
pub struct GzipHeader {
pub mut:
	length            int = 10
	extra             []u8
	filename          []u8
	comment           []u8
	modification_time u32
	operating_system  u8
}

// validate validates the header and returns its details if valid
@[direct_array_access]
pub fn validate(data []u8, params DecompressParams) !GzipHeader {
	if data.len < min_header_length {
		return error('data is too short, not gzip compressed?')
	} else if data[0] != 0x1f || data[1] != 0x8b {
		return error('wrong magic numbers, not gzip compressed?')
	} else if data[2] != 0x08 {
		return error('gzip data is not compressed with DEFLATE')
	}
	mut header := GzipHeader{}

	// parse flags, we ignore most of them, but we still need to parse them
	// correctly, so we dont accidently decompress something that belongs
	// to the header

	if data[3] & reserved_bits > 0 {
		// rfc 1952 2.3.1.2 Compliance
		// A compliant decompressor must give an error indication if any
		// reserved bit is non-zero, since such a bit could indicate the
		// presence of a new field that would cause subsequent data to be
		// interpreted incorrectly.
		return error('reserved flags are set, unsupported field detected')
	}

	if data[3] & fextra > 0 {
		xlen := data[header.length]
		header.extra = data[header.length + 1..header.length + 1 + xlen]
		header.length += xlen + 1
	}
	if data[3] & fname > 0 {
		// filename is zero-terminated, so skip until we hit a zero byte
		for header.length < data.len && data[header.length] != 0x00 {
			header.filename << data[header.length]
			header.length++
		}
		header.length++
	}
	if data[3] & fcomment > 0 {
		// comment is zero-terminated, so skip until we hit a zero byte
		for header.length < data.len && data[header.length] != 0x00 {
			header.comment << data[header.length]
			header.length++
		}
		header.length++
	}
	if data[3] & fhcrc > 0 {
		if header.length + 12 > data.len {
			return error('data too short')
		}
		checksum_header := crc32.sum(data[..header.length])
		checksum_header_expected := (u32(data[header.length]) << 24) | (u32(data[header.length + 1]) << 16) | (u32(data[
			header.length + 2]) << 8) | data[header.length + 3]
		if params.verify_header_checksum && checksum_header != checksum_header_expected {
			return error('header checksum verification failed')
		}
		header.length += 4
	}
	if header.length + 8 > data.len {
		return error('data too short')
	}
	header.operating_system = data[9]
	return header
}

// decompress an array of bytes using zlib and returns the decompressed bytes in a new array.
// Example: b := 'abcdef'.repeat(1000).bytes(); cmpr := gzip.compress(b)!; decmpr := gzip.decompress(cmpr)!; assert cmpr.len < b.len; assert b == decmpr
pub fn decompress(data []u8, params DecompressParams) ![]u8 {
	gzip_header := validate(data, params)!
	header_length := gzip_header.length

	decompressed := compr.decompress(data[header_length..data.len - 8], 0)!
	length_expected := (u32(data[data.len - 1]) << 24) | (u32(data[data.len - 2]) << 16) | (u32(data[data.len - 3]) << 8) | data[data.len - 4]
	if params.verify_length && decompressed.len != length_expected {
		return error('length verification failed, got ${decompressed.len}, expected ${length_expected}')
	}
	checksum := crc32.sum(decompressed)
	checksum_expected := (u32(data[data.len - 5]) << 24) | (u32(data[data.len - 6]) << 16) | (u32(data[data.len - 7]) << 8) | data[data.len - 8]
	if params.verify_checksum && checksum != checksum_expected {
		return error('checksum verification failed')
	}
	return decompressed
}

// decompress_with_callback decompresses the given `data`, using zlib. It calls `cb` with each chunk of decompressed bytes.
// A chunk is usually 32 KB or less. Note: the chunk data received by `cb` should be cloned, if you need to store it for later,
// and not process it right away.
// The callback function should return the chunk length, if it wants to continue decompressing, or 0, if it wants to abort the decompression early.
// See also compress.ChunkCallback for more details.
pub fn decompress_with_callback(data []u8, cb compr.ChunkCallback, userdata voidptr, params DecompressParams) !int {
	gzip_header := validate(data, params)!
	header_len := gzip_header.length
	expected_len := int((u32(data[data.len - 1]) << 24) | (u32(data[data.len - 2]) << 16) | (u32(data[data.len - 3]) << 8) | data[data.len - 4])
	body := data[header_len..data.len - 8]
	chunks_len := int(compr.decompress_with_callback(body, cb, userdata, 0)!)
	if params.verify_length && expected_len != chunks_len {
		return error('Decompress error: expected length:${expected_len}, got:${chunks_len}')
	}
	return chunks_len
}