# Source code for blockify.utilities

from functools import reduce
import numpy as np
from pybedtools import BedTool

# Largest finite double-precision (float64) value, and its base-10 logarithm
FLOAT_MAX = np.finfo(float).max
LOG10_FLOAT_MAX = np.log10(FLOAT_MAX)


# Fast method for getting number of lines in a file
# For BED files, much faster than calling len() on file
# From https://stackoverflow.com/questions/845058/how-to-get-line-count-cheaply-in-python
def file_len(fname):
    """Count the number of lines in a text file.

    The file is streamed line by line rather than loaded into memory. For
    BED files this is much faster than calling len() on a BedTool object.
    Adapted from
    https://stackoverflow.com/questions/845058/how-to-get-line-count-cheaply-in-python

    Parameters
    ----------
    fname: str
        Input (text) filename

    Returns
    -------
    length: int
        Number of lines in fname; 0 if the file is empty. A final line
        without a trailing newline is still counted.
    """

    with open(fname) as f:
        # Summing over the iterator yields 0 for an empty file, whereas a
        # loop index would never be bound in that case.
        return sum(1 for _ in f)


def getChromosomesInDF(df):
    """Helper function to get a list of unique chromosomes in a ``pandas`` DataFrame.

    Chromosomes are returned in order of first appearance in the
    ``"chrom"`` column.

    Parameters
    ----------
    df: ``pandas`` DataFrame
        Input genomic data (e.g BED, qBED, CCF) as a DataFrame. Must have a
        ``"chrom"`` column whose values are hashable (e.g. strings).

    Returns
    -------
    chroms: list
        List of unique chromosomes, in order of first occurrence
    """

    # dict keys preserve insertion order, so this de-duplicates in a single
    # pass instead of scanning a growing list for every row.
    return list(dict.fromkeys(df["chrom"]))


def isSortedBEDObject(bed_object):
    """Tests whether a BedTool object is sorted.

    The data are considered sorted when the ``chrom`` column is in
    non-decreasing order and, within each chromosome, the ``start`` column
    is in non-decreasing order. End coordinates are not examined.

    Parameters
    ----------
    bed_object: BedTool object
        Input data as a BedTool object (anything exposing a
        ``to_dataframe()`` method that returns a DataFrame with ``chrom``
        and ``start`` columns)

    Returns
    -------
    is_sorted: bool
        True if the data are sorted as described above, False otherwise
    """

    # Convert BedTool object to pandas DataFrame
    df = bed_object.to_dataframe()
    # First, check that chrom is in sorted order. ``Series.is_monotonic``
    # was removed in pandas 2.0; ``is_monotonic_increasing`` is the
    # equivalent, long-standing spelling.
    if not df["chrom"].is_monotonic_increasing:
        return False
    # Then check that the start coordinates are in order within each
    # chromosome. all() stops at the first chromosome that is out of order.
    return all(
        starts.is_monotonic_increasing
        for _, starts in df.groupby("chrom", sort=False)["start"]
    )


def isSortedBEDFile(bed_file_path):
    """Wrapper function to feed filepaths to isSortedBEDObject.

    The path is wrapped in a ``BedTool`` and the result of
    ``isSortedBEDObject`` on that object is returned unchanged.

    Parameters
    ----------
    bed_file_path: str
        Path to BED/qBED/CCF data file

    Returns
    -------
    is_sorted: bool
        Whatever ``isSortedBEDObject`` reports for the file's contents
    """

    return isSortedBEDObject(BedTool(bed_file_path))