bx-python / lib / bx / intervals / operations /

Concatenate sets of intervals. 

Preserves format of the first input -- it is possible to concat two files that
have different column orders. Of course, the meta-data of the second will be
lost (and filled with a "."). If all of the files (GenomicIntervalReaders) are
the same format, sameformat=True will preserve all columns of the first input,
cut extra columns on subsequent inputs, and pad missing columns. If
sameformat=False then extra columns are filled with ".".

import psyco_full

import traceback
import fileinput
from warnings import warn

from import *
from bx.intervals.operations import *

def concat(readers, comments=True, header=True, sameformat=True):
    """
    Concatenate the intervals of several readers into one stream.

    The output layout is taken from the first reader: its chrom/start/end/
    strand column positions, and the field count of the first interval seen.

    readers    -- sequence of iterables producing GenomicInterval, Header
                  and Comment items; readers[0] must expose chrom_col,
                  start_col, end_col and strand_col. An empty sequence
                  yields nothing.
    comments   -- when true, Comment items are passed through unchanged.
    header     -- when true, Header items are passed through unchanged.
    sameformat -- when true, every interval keeps its own columns and is
                  only trimmed or padded with "." to the reference field
                  count. When false, this applies to the first dataset
                  only; intervals from later datasets are rebuilt so that
                  only chrom, start, end and (if it fits) strand are kept,
                  placed at the first reader's column positions, with all
                  other fields set to ".".

    Yields copies of the intervals (the input intervals are not modified),
    interleaved with any passed-through headers and comments.
    """
    if not readers:
        return
    # Save columns from the first input
    chrom_col = readers[0].chrom_col
    start_col = readers[0].start_col
    end_col = readers[0].end_col
    strand_col = readers[0].strand_col
    nfields = None
    # "First dataset" lasts until a reader has finished after at least one
    # interval was emitted, so leading readers without intervals do not
    # end it.
    firstdataset = True
    output = False
    for intervals in readers:
        for interval in intervals:
            if isinstance(interval, GenomicInterval):
                # The first interval seen fixes the output field count.
                if nfields is None:
                    nfields = interval.nfields
                out_interval = interval.copy()
                if sameformat or firstdataset:
                    # everything except the first input has to be
                    # trimmed or padded to match the first input
                    if len(out_interval.fields) > nfields:
                        out_interval.fields = out_interval.fields[0:nfields]
                    while len(out_interval.fields) < nfields:
                        out_interval.fields.append(".")
                    output = True
                    yield out_interval
                else:
                    # Different layout: read the coordinates through the
                    # interval's own accessors before the fields are reset.
                    chrom = out_interval.chrom
                    start = out_interval.start
                    end = out_interval.end
                    strand = out_interval.strand
                    out_interval.fields = ["."] * nfields
                    out_interval.fields[chrom_col] = chrom
                    out_interval.fields[start_col] = str(start)
                    out_interval.fields[end_col] = str(end)
                    # Strand is optional, might not exist in output
                    if strand_col < len(out_interval.fields):
                        out_interval.fields[strand_col] = strand
                    yield out_interval
            elif isinstance(interval, Header) and header:
                yield interval
            elif isinstance(interval, Comment) and comments:
                yield interval
        if output and firstdataset:
            firstdataset = False