Skip to content

Commit 46ec5e0

Browse files
authored
Merge pull request #22516 from wm75/improve-bam-chunked-display
Fix display of BAMs with large headers
2 parents 3d2a0dd + 550b9d9 commit 46ec5e0

1 file changed

Lines changed: 30 additions & 25 deletions

File tree

lib/galaxy/datatypes/binary.py

Lines changed: 30 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -771,33 +771,38 @@ def get_chunk(self, trans, dataset: HasFileName, offset: int = 0, ck_size: Optio
771771
with pysam.AlignmentFile(dataset.get_file_name(), "rb", check_sq=False) as bamfile:
772772
if ck_size is None:
773773
ck_size = 300 # 300 lines
774-
if offset == 0:
775-
offset = bamfile.tell()
776-
ck_lines = bamfile.text.strip().replace("\t", " ").splitlines() # type: ignore[attr-defined]
774+
if offset < bamfile.tell():
775+
# interpret an offset before the first alignment start as the index of
776+
# the header line at which the chunk should start
777+
header_lines = bamfile.text.strip().replace("\t", " ").splitlines() # type: ignore[attr-defined]
778+
ck_lines = header_lines[offset : offset + ck_size]
779+
offset += len(ck_lines)
780+
if offset >= len(header_lines):
781+
# consumed the entire header, now jump forward to the first alignment
782+
offset = bamfile.tell()
777783
else:
778-
bamfile.seek(offset)
779784
ck_lines = []
780-
for line_number, alignment in enumerate(bamfile, len(ck_lines)):
781-
# return only Header lines if 'header_line_count' exceeds 'ck_size'
782-
# FIXME: Can be problematic if bam has million lines of header
783-
if line_number >= ck_size:
784-
break
785-
786-
offset = bamfile.tell()
787-
bamline = alignment.to_string()
788-
# With multiple tags, Galaxy would display each as a separate column
789-
# because the 'to_string()' function uses tabs also between tags.
790-
# Below code will turn these extra tabs into spaces.
791-
n_tabs = bamline.count("\t")
792-
if n_tabs > 11:
793-
bamline, *extra_tags = bamline.rsplit("\t", maxsplit=n_tabs - 11)
794-
bamline = f"{bamline} {' '.join(extra_tags)}"
795-
ck_lines.append(bamline)
796-
else:
797-
# Nothing to enumerate; we've either offset to the end
798-
# of the bamfile, or there is no data. (possible with
799-
# header-only bams)
800-
offset = -1
785+
if len(ck_lines) < ck_size:
786+
bamfile.seek(offset)
787+
for line_number, alignment in enumerate(bamfile, len(ck_lines)):
788+
if line_number >= ck_size:
789+
break
790+
791+
offset = bamfile.tell()
792+
bamline = alignment.to_string()
793+
# With multiple tags, Galaxy would display each as a separate column
794+
# because the 'to_string()' function uses tabs also between tags.
795+
# Below code will turn these extra tabs into spaces.
796+
n_tabs = bamline.count("\t")
797+
if n_tabs > 11:
798+
bamline, *extra_tags = bamline.rsplit("\t", maxsplit=n_tabs - 11)
799+
bamline = f"{bamline} {' '.join(extra_tags)}"
800+
ck_lines.append(bamline)
801+
else:
802+
# Nothing to enumerate; we've either offset to the end
803+
# of the bamfile, or there is no data. (possible with
804+
# header-only bams)
805+
offset = -1
801806
ck_data = "\n".join(ck_lines)
802807
except Exception as e:
803808
offset = -1

0 commit comments

Comments
 (0)