@@ -771,33 +771,38 @@ def get_chunk(self, trans, dataset: HasFileName, offset: int = 0, ck_size: Optio
771771 with pysam .AlignmentFile (dataset .get_file_name (), "rb" , check_sq = False ) as bamfile :
772772 if ck_size is None :
773773 ck_size = 300 # 300 lines
774- if offset == 0 :
775- offset = bamfile .tell ()
776- ck_lines = bamfile .text .strip ().replace ("\t " , " " ).splitlines () # type: ignore[attr-defined]
774+ if offset < bamfile .tell ():
775+ # interpret an offset before the first alignment start as the index of
776+ # the header line at which the chunk should start
777+ header_lines = bamfile .text .strip ().replace ("\t " , " " ).splitlines () # type: ignore[attr-defined]
778+ ck_lines = header_lines [offset : offset + ck_size ]
779+ offset += len (ck_lines )
780+ if offset >= len (header_lines ):
781+ # consumed the entire header, now jump forward to the first alignment
782+ offset = bamfile .tell ()
777783 else :
778- bamfile .seek (offset )
779784 ck_lines = []
780- for line_number , alignment in enumerate ( bamfile , len (ck_lines )) :
781- # return only Header lines if 'header_line_count' exceeds 'ck_size'
782- # FIXME: Can be problematic if bam has million lines of header
783- if line_number >= ck_size :
784- break
785-
786- offset = bamfile .tell ()
787- bamline = alignment .to_string ()
788- # With multiple tags, Galaxy would display each as a separate column
789- # because the 'to_string()' function uses tabs also between tags.
790- # Below code will turn these extra tabs into spaces.
791- n_tabs = bamline .count ("\t " )
792- if n_tabs > 11 :
793- bamline , * extra_tags = bamline .rsplit ("\t " , maxsplit = n_tabs - 11 )
794- bamline = f"{ bamline } { ' ' .join (extra_tags )} "
795- ck_lines .append (bamline )
796- else :
797- # Nothing to enumerate; we've either offset to the end
798- # of the bamfile, or there is no data. (possible with
799- # header-only bams)
800- offset = - 1
785+ if len (ck_lines ) < ck_size :
786+ bamfile . seek ( offset )
787+ for line_number , alignment in enumerate ( bamfile , len ( ck_lines )):
788+ if line_number >= ck_size :
789+ break
790+
791+ offset = bamfile .tell ()
792+ bamline = alignment .to_string ()
793+ # With multiple tags, Galaxy would display each as a separate column
794+ # because the 'to_string()' function uses tabs also between tags.
795+ # Below code will turn these extra tabs into spaces.
796+ n_tabs = bamline .count ("\t " )
797+ if n_tabs > 11 :
798+ bamline , * extra_tags = bamline .rsplit ("\t " , maxsplit = n_tabs - 11 )
799+ bamline = f"{ bamline } { ' ' .join (extra_tags )} "
800+ ck_lines .append (bamline )
801+ else :
802+ # Nothing to enumerate; we've either offset to the end
803+ # of the bamfile, or there is no data. (possible with
804+ # header-only bams)
805+ offset = - 1
801806 ck_data = "\n " .join (ck_lines )
802807 except Exception as e :
803808 offset = - 1
0 commit comments