Skip to content

Commit 92c06e1

Browse files
committed
Fix adaptive XZ dictionary size for real this time
Forgot to `.rewind()`, as one does. Added a handful of Anyhow `.with_context()` calls to help prevent future goose chases.
1 parent dd927ab commit 92c06e1

File tree

1 file changed

+48
-18
lines changed

1 file changed

+48
-18
lines changed

src/recompress.rs

Lines changed: 48 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,11 @@
1616
const MAX_XZ_DICTSIZE: u32 = 128 * 1024 * 1024;
1717

1818
use crate::Context;
19+
use anyhow::Context as _;
1920
use std::convert::TryFrom;
2021
use std::fmt::Write as FmtWrite;
2122
use std::fs::{self, File};
22-
use std::io::{self, Read, Write};
23+
use std::io::{self, Read, Seek, Write};
2324
use std::path::Path;
2425
use std::time::{Duration, Instant};
2526
use xz2::read::XzDecoder;
@@ -34,7 +35,7 @@ pub(crate) fn recompress_file(
3435
let file_start = Instant::now();
3536
let gz_path = xz_path.with_extension("gz");
3637

37-
let mut in_file = File::open(xz_path)?;
38+
let mut in_file = File::open(xz_path).with_context(|| "failed to open XZ-compressed input")?;
3839
let mut dec_buf = vec![0u8; 4 * 1024 * 1024];
3940
let mut compression_times = String::new();
4041

@@ -48,11 +49,11 @@ pub(crate) fn recompress_file(
4849
dec_measurements = Some(decompress_and_write(
4950
&mut in_file,
5051
&mut dec_buf,
51-
&mut [&mut gz_encoder],
52+
&mut [("gz", &mut gz_encoder)],
5253
&mut gz_duration,
5354
)?);
5455
let [gz_duration] = gz_duration;
55-
format_compression_time(&mut compression_times, "gz", gz_duration)?;
56+
format_compression_time(&mut compression_times, "gz", gz_duration, None)?;
5657
};
5758

5859
// xz recompression with more aggressive settings than we want to take the time
@@ -71,13 +72,13 @@ pub(crate) fn recompress_file(
7172
Some((_, size)) => size,
7273
None => measure_compressed_file(&mut in_file, &mut dec_buf)?.1,
7374
};
74-
let in_size = u32::try_from(in_size).unwrap_or(u32::MAX);
75+
let dictsize = choose_xz_dictsize(u32::try_from(in_size).unwrap_or(u32::MAX));
7576

7677
let mut filters = xz2::stream::Filters::new();
7778
let mut lzma_ops = xz2::stream::LzmaOptions::new_preset(9).unwrap();
7879
// This sets the overall dictionary size, which is also how much memory (baseline)
7980
// is needed for decompression.
80-
lzma_ops.dict_size(choose_xz_dictsize(in_size));
81+
lzma_ops.dict_size(dictsize);
8182
// Use the best match finder for compression ratio.
8283
lzma_ops.match_finder(xz2::stream::MatchFinder::BinaryTree4);
8384
lzma_ops.mode(xz2::stream::Mode::Normal);
@@ -105,11 +106,11 @@ pub(crate) fn recompress_file(
105106
dec_measurements = Some(decompress_and_write(
106107
&mut in_file,
107108
&mut dec_buf,
108-
&mut [&mut xz_encoder],
109+
&mut [("xz", &mut xz_encoder)],
109110
&mut xz_duration,
110111
)?);
111112
let [xz_duration] = xz_duration;
112-
format_compression_time(&mut compression_times, "xz", xz_duration)?;
113+
format_compression_time(&mut compression_times, "xz", xz_duration, Some(dictsize))?;
113114
}
114115

115116
drop(in_file);
@@ -136,17 +137,20 @@ pub(crate) fn recompress_file(
136137
/// durations and returns the total time taken by the decompressor and the total
137138
/// size of the decompressed stream.
138139
fn decompress_and_write(
139-
src: &mut impl Read,
140+
src: &mut (impl Read + Seek),
140141
buf: &mut [u8],
141-
destinations: &mut [&mut dyn Write],
142+
destinations: &mut [(&str, &mut dyn Write)],
142143
time_by_dst: &mut [Duration],
143-
) -> io::Result<(Duration, u64)> {
144+
) -> anyhow::Result<(Duration, u64)> {
145+
src.rewind().with_context(|| "input file seek failed")?;
144146
let mut decompressor = XzDecoder::new(src);
145147
let mut decompress_time = Duration::ZERO;
146148
let mut total_length = 0_u64;
147149
loop {
148150
let start = Instant::now();
149-
let length = decompressor.read(buf)?;
151+
let length = decompressor
152+
.read(buf)
153+
.with_context(|| "XZ decompression failed")?;
150154
decompress_time += start.elapsed();
151155
total_length += length as u64;
152156
if length == 0 {
@@ -155,9 +159,11 @@ fn decompress_and_write(
155159
// This code assumes that compression with `write_all` will never fail (i.e.,
156160
// we can take arbitrary amounts of data as input). That seems like a
157161
// reasonable assumption though.
158-
for (idx, destination) in destinations.iter_mut().enumerate() {
162+
for (idx, (compname, destination)) in destinations.iter_mut().enumerate() {
159163
let start = std::time::Instant::now();
160-
destination.write_all(&buf[..length])?;
164+
destination
165+
.write_all(&buf[..length])
166+
.with_context(|| format!("{compname} compression failed"))?;
161167
time_by_dst[idx] += start.elapsed();
162168
}
163169
}
@@ -166,12 +172,33 @@ fn decompress_and_write(
166172

167173
/// Calls `decompress_and_write` solely to measure the file's uncompressed size
168174
/// and the time taken by decompression.
169-
fn measure_compressed_file(src: &mut impl Read, buf: &mut [u8]) -> io::Result<(Duration, u64)> {
175+
fn measure_compressed_file(
176+
src: &mut (impl Read + Seek),
177+
buf: &mut [u8],
178+
) -> anyhow::Result<(Duration, u64)> {
170179
decompress_and_write(src, buf, &mut [], &mut [])
171180
}
172181

173-
fn format_compression_time(out: &mut String, name: &str, duration: Duration) -> std::fmt::Result {
174-
write!(out, ", {:.2?} {} compression", duration, name)
182+
fn format_compression_time(
183+
out: &mut String,
184+
name: &str,
185+
duration: Duration,
186+
dictsize: Option<u32>,
187+
) -> std::fmt::Result {
188+
write!(out, ", {:.2?} {} compression", duration, name)?;
189+
if let Some(mut dictsize) = dictsize {
190+
let mut iprefix = 0;
191+
while iprefix < 2 && dictsize & 1023 == 0 {
192+
iprefix += 1;
193+
dictsize >>= 10;
194+
}
195+
write!(
196+
out,
197+
" with {dictsize} {}B dictionary",
198+
["", "Ki", "Mi"][iprefix]
199+
)?;
200+
}
201+
Ok(())
175202
}
176203

177204
/// Chooses the smallest XZ dictionary size that is at least as large as the
@@ -275,7 +302,10 @@ impl Context {
275302
let path = to_recompress.lock().unwrap().pop();
276303
path
277304
} {
278-
recompress_file(&xz_path, recompress_gz, compression_level, recompress_xz)?;
305+
recompress_file(&xz_path, recompress_gz, compression_level, recompress_xz)
306+
.with_context(|| {
307+
format!("failed to recompress {}", xz_path.display())
308+
})?;
279309
}
280310

281311
Ok::<_, anyhow::Error>(())

0 commit comments

Comments
 (0)