Source code for bam2html.util

# -*- coding=utf-8 -*-
import os
import re
import sys
import subprocess
from collections import defaultdict


[docs]def safe_open(filename, mode='r'):
    if 'w' in mode:
        dirname = os.path.dirname(filename)
        if dirname and not os.path.exists(dirname):
            os.makedirs(dirname)

    if filename.endswith('.gz'):
        import gzip
        return gzip.open(filename, mode=mode)

    return open(filename, mode=mode)


[docs]def run_cmd(cmd):
    print('>>> run cmd: \x1b[32m{}\x1b[0m'.format(cmd))
    res = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    stdout = res.stdout.read().decode()
    stderr = res.stderr.read().strip()

    if stdout:
        return stdout
    elif stderr:
        print('ERROR: {}'.format(stderr))
        exit(1)


[docs]def check_samtools(samtools):
    cmd = samtools + ' --version'
    res = run_cmd(cmd)
    if res:
        print('use samtools: {} (version: {})'.format(samtools, res.split('\n')[0]))
    else:
        print('samtools not exists, please check!')

        
[docs]def check_bam_index(bam, samtools):
    bam_index = bam + '.bai'
    if not os.path.exists(bam_index):
        cmd = '{samtools} index {bam}'.format(**locals())
        print('bam index not exists. creating with cmd: {}'.format(cmd))
        assert not os.system(cmd)


[docs]def samtools_tview(bam, chrom, pos, reference=None, samtools='samtools', column=None):
    cmd = '{samtools} tview -d H -p {chrom}:{pos} {bam} '
    if reference:
        cmd += reference
    if column:
        cmd = 'COLUMNS={column} ' + cmd

    cmd = cmd.format(**locals())

    
    res = run_cmd(cmd)
    if res:
        return res


[docs]def parse_position(position, column):
    """
        - chrom:pos
        - chrom:start-end
        - chrom:pos1,pos2,pos3
    """
    chrom, pos_info = position.split(':')
    if pos_info.isdigit():
        pos = int(pos_info)
        start_pos = pos - column / 2
        pos_list = [pos]
    else:
        if '-' in pos_info:
            start, end = list(map(int, pos_info.split('-')))
            pos_list = range(start, end + 1)
        elif ',' in pos_info:
            pos_list = list(map(int, pos_info.split(',')))
            start = min(pos_list)
            end = max(pos_list)
        else:
            print('bad position format, use with: chrom:pos or chrom:start-end or chrom:pos1,pos2,...')
            exit(1)

        if end - start > column:
            print('your region({start}-{end}) is larger than column({column})!'.format(**locals()))
            exit(1)

        start_pos = start - (column - end + start) / 2
        
    return chrom, start_pos, pos_list


[docs]def get_relative_index(ref_span, position, first_position):

    highlight_index = int(position) - int(first_position)
    linelist = re.split(r'<.*?>', ref_span)
    total_len = 0
    indel_len = 0
    for part in linelist:
        if part:
            if re.match(r'^\*+$', part):
                indel_len += len(part)
                continue
            total_len += len(part)
            if total_len > highlight_index:
                relative_index = highlight_index + indel_len
                return relative_index


[docs]def add_highlight_class(seq, index, n, base_stat, is_ref):
    base = seq[index]
    if not is_ref:
        base_stat[n][base.upper()] += 1

    highlight_base = '<i class="highlight" index="{}">{}</i>'.format(n, base)
    new_seq = seq[:index] + highlight_base + seq[index + 1:]

    return new_seq


[docs]def highlight_span(span, relative_index, n, base_stat, is_ref):

    linelist = re.split(r'(<.*?>)', span)

    total_len = 0
    for idx, each in enumerate(linelist):
        if not re.match(r'^<.*?>$', each):
            total_len += len(each)
            if total_len > relative_index:
                real_index = relative_index - (total_len - len(each))
                linelist[idx] = add_highlight_class(linelist[idx], real_index, n, base_stat, is_ref)
                break
    return ''.join(linelist)


[docs]def parse_html(html, pos_list, new_title, css=None):
    out_html = ''
    for line in html.strip().split('\n'):
        if not line.startswith('</style>'):
            out_html += line
            continue
        if css:
            out_html += css

        prefix = re.findall(r"(</style>.+?<pre class='tviewpre'>)", line)[0]
        out_html += prefix
        suffix = '</pre></div></body></html>'

        # the left position in html, it might be different with input when there are insertions
        origin_title = re.findall(r"<div class='tviewtitle'>(.*?:\d+?)</div>", prefix)[0]
        first_position = origin_title.split(':')[1]

        
        spans = re.findall(r'(<span.*?)<br/>', line)
        if len(spans) == 1:
            print('no reads covered')
            return html

        # the last span has no <br/>
        last_span = re.findall(r'<br/>(<span[^(br)]*?</span>)</pre>', line)[-1]

        # the first span is position information
        position_line = spans[0] + '<br/>'
        out_html += position_line

        # ref_span might exist *(insertion)
        ref_span = spans[1]
        relative_indexes = [get_relative_index(ref_span, pos, first_position) for pos in pos_list]

        new_spans = []
        base_stat = {}
        for i, span in enumerate(spans[1:] + [last_span]):
            is_ref = True if i == 0 else False
            for n, relative_index in enumerate(relative_indexes, 1):
                if n not in base_stat:
                    base_stat[n] = defaultdict(int)
                span = highlight_span(span, relative_index, n, base_stat, is_ref)
            new_spans.append(span)

        out_html += '<br/>'.join(new_spans) + '<br/>'
        for n in range(1, len(relative_indexes) + 1):
            for base in ('A', 'T', 'C', 'G'):
                if base not in base_stat[n]:
                    base_stat[n][base] = 0
            total = sum(v for k,v in base_stat[n].items() if k != ' ')
            base_stat[n]['deletion'] = base_stat[n]['*']
            base_stat[n]['total'] = total
            # print(base_stat[n])
            title = 'title="total reads: {total}&#10;mutations: A={A} T={T} C={C} G={G}&#10;deletions: {deletion}"'
            title = title.format(**base_stat[n])
            # print(title)
            out_html = out_html.replace('index="{}"'.format(n), title)
    
    # remove the last span's <br/>
    out_html = out_html[:-5]
    out_html += suffix
    out_html = out_html.replace(origin_title, new_title)

    return out_html




if __name__ == '__main__':
    
    # print(parse_position('1:100', 80))
    # print(parse_position('1:100', 200))
    # print(parse_position('chr1:100', 80))

    # print(parse_position('1:100-105', 80))
    # print(parse_position('1:100-200', 80))


    bam = 'example/demo.1_985460.bam'
    position = '1:985460'
    position = '1:985460-985463'
    column = 500

    chrom, start_pos, pos_list = parse_position(position, column)
    print(position)
    print(chrom, start_pos)
    print(pos_list)

    color = 'red'
    background = 'yellow'
    bold = 'none'
    highlight_css = '.highlight {{color:{color};background:{background};font-weight:{bold};font-style:normal;}}\n'.format(**locals())

    html = samtools_tview(bam, chrom, start_pos, column=column)
    out_html = parse_html(html, pos_list, position, css=highlight_css)
    with open('out.html', 'w') as out:
        out.write(out_html)