from functools import reduce
import numpy as np
from pybedtools import BedTool
# Largest finite value representable as a numpy float64, and its base-10
# logarithm (about 308.25).
FLOAT_MAX = np.finfo(np.float64).max
LOG10_FLOAT_MAX = np.log10(FLOAT_MAX)
# Fast method for getting number of lines in a file
# For BED files, much faster than calling len() on file
# From https://stackoverflow.com/questions/845058/how-to-get-line-count-cheaply-in-python
def file_len(fname):
    """Count the number of lines in a text file.

    Streams through the file once instead of loading it into memory. For BED
    files, this is much faster than calling ``len()`` on a BedTool object.
    Adapted from
    https://stackoverflow.com/questions/845058/how-to-get-line-count-cheaply-in-python

    Parameters
    ----------
    fname: str
        Input (text) filename

    Returns
    -------
    length: int
        Number of lines in ``fname``; 0 if the file is empty
    """
    # Initialise the counter up front: for an empty file the loop body never
    # runs, and the loop variable would otherwise be unbound at the return.
    length = 0
    with open(fname) as f:
        for length, _ in enumerate(f, start=1):
            pass
    return length
def getChromosomesInDF(df):
    """Helper function to get a list of unique chromosomes in a ``pandas`` DataFrame.

    Parameters
    ----------
    df: ``pandas`` DataFrame
        Input genomic data (e.g. BED, qBED, CCF) as a DataFrame; must have a
        ``chrom`` column

    Returns
    -------
    chroms: list
        Unique values of ``df["chrom"]``, in order of first appearance
    """
    # dict keys are unique and keep insertion order, so this de-duplicates in
    # a single pass while preserving the order in which chromosomes appear
    # (avoids rebuilding and rescanning a list for every row).
    return list(dict.fromkeys(df["chrom"]))
def isSortedBEDObject(bed_object):
    """Tests whether a BedTool object is sorted.

    The data counts as sorted when the ``chrom`` column is in non-decreasing
    order and, within each chromosome, the ``start`` column is in
    non-decreasing order.

    Parameters
    ----------
    bed_object: BedTool object
        Input data as a BedTool object

    Returns
    -------
    is_sorted: bool
        True if the data is sorted by ``chrom`` and then ``start``
    """
    # Convert BedTool object to pandas DataFrame
    df = bed_object.to_dataframe()
    # No rows means nothing can be out of order. Checked before any column
    # access because an empty frame may not carry the expected columns
    # (NOTE(review): confirm against pybedtools' behaviour for empty input).
    if df.empty:
        return True
    # First, check that chrom is in sorted order. ``Series.is_monotonic`` was
    # removed in pandas 2.0; ``is_monotonic_increasing`` is the equivalent
    # (non-strict) check.
    if not df["chrom"].is_monotonic_increasing:
        return False
    # Then check that the start coordinates are in order within each
    # chromosome, grouping once instead of re-filtering the frame per chromosome.
    starts_by_chrom = df.groupby("chrom", sort=False, observed=True)["start"]
    return all(starts.is_monotonic_increasing for _, starts in starts_by_chrom)
def isSortedBEDFile(bed_file_path):
    """Wrapper function that loads a file path and passes it to isSortedBEDObject.

    Parameters
    ----------
    bed_file_path: str
        Path to BED/qBED/CCF data file

    Returns
    -------
    is_sorted: bool
        Result of ``isSortedBEDObject`` for the data at ``bed_file_path``
    """
    # Wrap the path in a BedTool so the object-based check can be reused.
    bed_object = BedTool(bed_file_path)
    return isSortedBEDObject(bed_object)