Skip to content

Commit da143f1

Browse files
authored
Merge pull request #85 from ArcInstitute/binseq-0.9.0
Binseq 0.9.0
2 parents 54dc048 + cdc5b7a commit da143f1

46 files changed

Lines changed: 8072 additions & 1539 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/ci.yml

Lines changed: 26 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -26,41 +26,44 @@ jobs:
2626
- name: Linting
2727
run: cargo clippy --verbose
2828

29-
example_read_write:
30-
runs-on: ubuntu-latest
31-
steps:
32-
- uses: actions/checkout@v3
33-
- name: run example
34-
run: cargo run --release --example read_write
35-
36-
example_parallel:
29+
example_grep:
3730
runs-on: ubuntu-latest
31+
strategy:
32+
matrix:
33+
ext: [bq, vbq, cbq]
3834
steps:
3935
- uses: actions/checkout@v3
40-
- name: run example
41-
run: cargo run --release --example parallel_processing
36+
- name: run example ${{ matrix.ext }}
37+
run: cargo run --release --example grep -- ./data/subset.${{ matrix.ext }} "ACGTACGT"
4238

43-
example_example:
39+
example_range:
4440
runs-on: ubuntu-latest
41+
strategy:
42+
matrix:
43+
ext: [bq, vbq, cbq]
4544
steps:
4645
- uses: actions/checkout@v3
47-
- name: run example
48-
run: cargo run --release --example example
46+
- name: run example ${{ matrix.ext }}
47+
run: cargo run --release --example parallel_range -- ./data/subset.${{ matrix.ext }} 4 30 200
4948

50-
example_grep:
49+
example_write:
5150
runs-on: ubuntu-latest
51+
strategy:
52+
matrix:
53+
ext: [bq, vbq, cbq]
5254
steps:
5355
- uses: actions/checkout@v3
54-
- name: run example bq
55-
run: cargo run --release --example grep ./data/subset.bq
56-
- name: run example vbq
57-
run: cargo run --release --example grep ./data/subset.vbq
56+
- name: run example (single) ${{ matrix.ext }}
57+
run: cargo run --release --example write -- ./data/subset_R1.fastq.gz -o ./output.${{ matrix.ext }}
58+
- name: run example (paired) ${{ matrix.ext }}
59+
run: cargo run --release --example write -- ./data/subset_R1.fastq.gz ./data/subset_R2.fastq.gz -o ./output.${{ matrix.ext }}
5860

59-
example_range:
61+
example_read:
6062
runs-on: ubuntu-latest
63+
strategy:
64+
matrix:
65+
ext: [bq, vbq, cbq]
6166
steps:
6267
- uses: actions/checkout@v3
63-
- name: run example (bq)
64-
run: cargo run --release --example parallel_range -- ./data/subset.bq 4 30 200
65-
- name: run example (vbq)
66-
run: cargo run --release --example parallel_range -- ./data/subset.vbq 4 30 200
68+
- name: run example ${{ matrix.ext }}
69+
run: cargo run --release --example read -- ./data/subset.${{ matrix.ext }}

Cargo.toml

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[package]
22
name = "binseq"
33
version = "0.8.3"
4-
edition = "2021"
4+
edition = "2024"
55
description = "A high efficiency binary format for sequencing data"
66
license = "MIT"
77
authors = ["Noam Teyssier <noam.teyssier@arcinstitute.org>"]
@@ -11,25 +11,32 @@ categories = ["science::bioinformatics", "encoding", "data-structures"]
1111
keywords = ["bioinformatics", "nucleotide", "sequencing", "genomics", "fastq"]
1212

1313
[dependencies]
14-
anyhow = "1.0.100"
14+
anyhow = {version = "1.0.100", optional = true}
1515
auto_impl = "1.3.0"
16-
bitnuc = "0.3.2"
17-
bytemuck = "1.24.0"
16+
bitnuc = "0.4.0"
17+
bytemuck = { version = "1.24.0", features = ["derive", "extern_crate_alloc"] }
1818
byteorder = "1.5.0"
19-
itoa = "1.0.15"
19+
itoa = "1.0.17"
20+
memchr = "2.7.6"
2021
memmap2 = "0.9.9"
2122
num_cpus = "1.17.0"
23+
paraseq = { version = "0.4.8", optional = true }
24+
parking_lot = {version = "0.12.5", optional = true }
2225
rand = { version = "0.9.2", features = ["small_rng"] }
26+
sucds = "0.8.3"
2327
thiserror = "2.0.17"
2428
zstd = { version = "0.13.3", features = ["zstdmt"] }
2529

2630
[dev-dependencies]
27-
nucgen = "0.2.0"
28-
niffler = "3.0.0"
29-
seq_io = "0.3.4"
31+
anyhow = "1.0.100"
3032
parking_lot = "0.12.5"
31-
itoa = "1.0.15"
32-
memchr = "2.7.6"
33+
clap = { version = "4.5.54", features = ["derive"] }
34+
paraseq = "0.4.8"
35+
36+
[features]
37+
default = ["paraseq", "anyhow"]
38+
anyhow = ["dep:anyhow"]
39+
paraseq = ["dep:paraseq", "dep:parking_lot"]
3340

3441
[lints.clippy]
3542
pedantic = { level = "warn", priority = -1 }

README.md

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,18 @@
1010
BINSEQ is a binary file format family designed for efficient storage and processing of DNA sequences.
1111
They make use of two-bit encoding for nucleotides and are optimized for high-performance parallel processing.
1212

13-
BINSEQ currently has two flavors:
13+
BINSEQ has three variants:
1414

1515
1. **BQ**: (`*.bq`) files are for _fixed-length_ records **without** quality scores.
1616
2. **VBQ**: (`*.vbq`) files are for _variable-length_ records **with optional** quality scores and headers.
17+
3. **CBQ**: (`*.cbq`) files are for _columnar variable-length_ records **with optional** quality scores and headers.
1718

18-
Both flavors support both single and paired sequences.
19+
All variants support both single and paired sequences.
20+
21+
**Note:** For most use cases, the newest variant _CBQ_ is recommended due to its flexibility, storage efficiency, and decoding speed.
22+
It supersedes _VBQ_ in terms of performance and storage efficiency, at a small cost in encoding speed.
23+
VBQ will still be supported but newer projects should consider using _CBQ_ instead.
24+
For information on the structure of _CBQ_ files, see the [documentation](https://docs.rs/binseq/latest/binseq/cbq/).
1925

2026
## Getting Started
2127

@@ -24,4 +30,4 @@ This is a **library** for reading and writing BINSEQ files, for a **command-line
2430
To get started please refer to our [documentation](https://docs.rs/binseq/latest/binseq/).
2531
For example programs which make use of the library check out our [examples directory](https://github.com/arcinstitute/binseq/tree/main/examples).
2632

27-
For more information about the BINSEQ file family, please refer to our [preprint](https://www.biorxiv.org/content/10.1101/2025.04.08.647863v1).
33+
For more information about the BINSEQ file family, please refer to our [preprint](https://www.biorxiv.org/content/10.1101/2025.04.08.647863v2).

data/subset.cbq

775 KB
Binary file not shown.

data/subset.vbq

360 KB
Binary file not shown.

examples/auto-write.rs

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
use std::{fs::File, io::BufWriter};
2+
3+
use anyhow::Result;
4+
use binseq::{BinseqWriterBuilder, write::Format};
5+
use bitnuc::BitSize;
6+
use clap::Parser;
7+
8+
type BoxedWriter = Box<dyn std::io::Write + Send>;
9+
10+
#[derive(Parser)]
11+
struct Args {
12+
/// Input FASTX to encode into BINSEQ format
13+
#[clap(required = true)]
14+
input: String,
15+
16+
/// Input FASTX to encode into BINSEQ format (R2)
17+
#[clap(required = false)]
18+
input2: Option<String>,
19+
20+
/// Output file path for BINSEQ format
21+
#[clap(short = 'o', long)]
22+
output: Option<String>,
23+
24+
/// Default prefix for writing BINSEQ: `<prefix>.<ext>`
25+
#[clap(short = 'p', long, default_value = "output")]
26+
prefix: String,
27+
28+
/// Format of the output BINSEQ file
29+
///
30+
/// [bq: bq|BQ|b, vbq: vbq|VBQ|v, cbq: cbq|CBQ|c]
31+
#[clap(short = 'f', long)]
32+
format: Option<Format>,
33+
34+
/// Exclude quality information in BINSEQ output
35+
///
36+
/// (bq ignores quality always)
37+
#[clap(short = 'Q', long)]
38+
exclude_quality: bool,
39+
40+
/// Exclude sequence headers in BINSEQ output
41+
///
42+
/// (bq ignores headers always)
43+
#[clap(short = 'H', long)]
44+
exclude_headers: bool,
45+
46+
/// Compression level for BINSEQ output (0: auto)
47+
#[clap(long, default_value_t = 0)]
48+
compression_level: i32,
49+
50+
/// Default BITSIZE for BINSEQ output (2: 2bit, 4: 4bit)
51+
#[clap(long, default_value_t = 2)]
52+
bitsize: u8,
53+
54+
/// Default BLOCKSIZE in KB for BINSEQ output (vbq,cbq)
55+
#[clap(long, default_value_t = 128)]
56+
blocksize: usize,
57+
58+
/// Number of threads to use for parallel processing, 0: all available
59+
#[clap(short = 'T', long, default_value = "0")]
60+
threads: usize,
61+
}
62+
impl Args {
63+
/// Determines the output format based on the file extension or the provided format
64+
fn format(&self) -> Format {
65+
if let Some(format) = self.format {
66+
format
67+
} else {
68+
if let Some(output) = &self.output {
69+
match output.split(".").last() {
70+
Some("bq") => Format::Bq,
71+
Some("vbq") => Format::Vbq,
72+
Some("cbq") => Format::Cbq,
73+
_ => Format::default(),
74+
}
75+
} else {
76+
Format::default()
77+
}
78+
}
79+
}
80+
fn bitsize(&self) -> BitSize {
81+
match self.bitsize {
82+
4 => BitSize::Four,
83+
_ => BitSize::Two,
84+
}
85+
}
86+
87+
/// Creates an output file handle
88+
fn ohandle(&self) -> Result<BoxedWriter> {
89+
let path = if let Some(output) = &self.output {
90+
output.to_string()
91+
} else {
92+
format!("{}{}", &self.prefix, self.format().extension())
93+
};
94+
let ofile = File::create(path).map(BufWriter::new)?;
95+
Ok(Box::new(ofile))
96+
}
97+
98+
fn is_paired(&self) -> bool {
99+
self.input2.is_some()
100+
}
101+
}
102+
103+
fn main() -> Result<()> {
104+
let args = Args::parse();
105+
let handle = args.ohandle()?;
106+
let builder = BinseqWriterBuilder::new(args.format())
107+
.bitsize(args.bitsize())
108+
.block_size(args.blocksize * 1024)
109+
.headers(!args.exclude_headers)
110+
.quality(!args.exclude_quality)
111+
.compression_level(args.compression_level)
112+
.encode_fastx(handle);
113+
if args.is_paired() {
114+
builder.input_paired(&args.input, args.input2.as_ref().unwrap())
115+
} else {
116+
builder.input(&args.input)
117+
}
118+
.threads(args.threads)
119+
.run()?;
120+
121+
Ok(())
122+
}

examples/grep.rs

Lines changed: 22 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
use std::sync::Arc;
22

33
use anyhow::Result;
4-
use binseq::{context::SeqCtx, prelude::*};
4+
use binseq::prelude::*;
5+
use clap::Parser;
56
use memchr::memmem::Finder;
67
use parking_lot::Mutex;
78

89
#[derive(Clone)]
910
pub struct GrepCounter {
1011
// (thread) local variables
11-
ctx: SeqCtx,
1212
local_count: usize,
1313

1414
// search pattern (using memchr::memmem::Finder for fast searching)
@@ -21,7 +21,6 @@ impl GrepCounter {
2121
#[must_use]
2222
pub fn new(pattern: &[u8]) -> Self {
2323
Self {
24-
ctx: SeqCtx::default(),
2524
pattern: Finder::new(pattern).into_owned(),
2625
local_count: 0,
2726
count: Arc::new(Mutex::new(0)),
@@ -38,9 +37,7 @@ impl GrepCounter {
3837
}
3938
impl ParallelProcessor for GrepCounter {
4039
fn process_record<R: binseq::BinseqRecord>(&mut self, record: R) -> binseq::Result<()> {
41-
self.ctx.fill(&record)?;
42-
43-
if self.match_sequence(&self.ctx.sbuf()) || self.match_sequence(&self.ctx.xbuf()) {
40+
if self.match_sequence(&record.sseq()) || self.match_sequence(&record.xseq()) {
4441
self.local_count += 1;
4542
}
4643

@@ -54,21 +51,26 @@ impl ParallelProcessor for GrepCounter {
5451
}
5552
}
5653

57-
fn main() -> Result<()> {
58-
let path = std::env::args()
59-
.nth(1)
60-
.unwrap_or("./data/subset.bq".to_string());
61-
let pattern = std::env::args()
62-
.nth(2)
63-
.unwrap_or("ACGT".to_string())
64-
.as_bytes()
65-
.to_vec();
66-
let n_threads = std::env::args().nth(3).unwrap_or("1".to_string()).parse()?;
54+
#[derive(Parser)]
55+
struct Args {
56+
/// Input BINSEQ path to grep
57+
#[clap(required = true)]
58+
input: String,
6759

68-
let reader = BinseqReader::new(&path)?;
69-
let counter = GrepCounter::new(&pattern);
70-
reader.process_parallel(counter.clone(), n_threads)?;
71-
counter.pprint();
60+
/// Pattern to search for (either sseq or xseq)
61+
#[clap(required = true)]
62+
pattern: String,
7263

64+
/// Threads to use [0: auto]
65+
#[clap(short = 'T', long, default_value_t = 0)]
66+
threads: usize,
67+
}
68+
69+
fn main() -> Result<()> {
70+
let args = Args::parse();
71+
let reader = BinseqReader::new(&args.input)?;
72+
let counter = GrepCounter::new(args.pattern.as_bytes());
73+
reader.process_parallel(counter.clone(), args.threads)?;
74+
counter.pprint();
7375
Ok(())
7476
}

examples/network_streaming.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,10 @@ use std::io::{BufReader, BufWriter};
22
use std::net::{TcpListener, TcpStream};
33
use std::thread;
44

5-
use binseq::bq::{BinseqHeader, BinseqHeaderBuilder, StreamReader, StreamWriterBuilder};
5+
use binseq::bq::{FileHeader, FileHeaderBuilder, StreamReader, StreamWriterBuilder};
66
use binseq::{BinseqRecord, Policy, Result};
77

8-
fn server(header: BinseqHeader, sequence: &[u8]) -> Result<()> {
8+
fn server(header: FileHeader, sequence: &[u8]) -> Result<()> {
99
// Create a listener on localhost:3000
1010
let listener = TcpListener::bind("127.0.0.1:3000").expect("Failed to bind to address");
1111
println!("Server listening on 127.0.0.1:3000");
@@ -25,6 +25,7 @@ fn server(header: BinseqHeader, sequence: &[u8]) -> Result<()> {
2525

2626
// Write sequences in a loop
2727
for i in 0..10 {
28+
#[allow(deprecated)]
2829
writer.write_record(Some(i), sequence)?;
2930
println!("Server: Sent record {i}");
3031

@@ -79,7 +80,7 @@ fn client() -> Result<()> {
7980

8081
fn main() -> Result<()> {
8182
// Create a header for sequences of length 100
82-
let header = BinseqHeaderBuilder::new().slen(100).build()?;
83+
let header = FileHeaderBuilder::new().slen(100).build()?;
8384

8485
// Create some example sequence data
8586
let sequence = b"ACGT".repeat(25); // 100 nucleotides

0 commit comments

Comments
 (0)