"""Generates a normalized bedGraph of event rate for BED-formatted genomic data.

The output file can be visualized on genome browsers.
"""
import warnings

from pybedtools import BedTool

from . import segmentation
from . import utilities

# Suppress certain warnings
warnings.simplefilter("ignore", category=FutureWarning)
warnings.simplefilter("ignore", category=ResourceWarning)
def validateNormalizationArguments(input_file, regions_bed, libraryFactor, lengthFactor):
"""Validates parameters passed via the command line.
Parameters
----------
input_file: BedTool object
BedTool object (instantiated from pybedtools) for input data
regions_bed: BedTool object
BedTool object (instantiated from pybedtools) for regions over which we are normalizing input_file
libraryFactor: float
Scalar to normalize by input_file's library size.
lengthFactor: float or None
Scalar to normalize by each block's length. If None, no length normalization is performed.
Returns
-------
None: None
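
Examples
--------
A minimal sketch of direct usage; the file paths are hypothetical, and both
files are assumed to be coordinate-sorted BED.

>>> from pybedtools import BedTool
>>> events = BedTool("events.sorted.bed")    # hypothetical, sorted input
>>> regions = BedTool("regions.sorted.bed")  # hypothetical, sorted regions
>>> validateNormalizationArguments(events, regions, 1e6, 1000)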
"""
# Check that the input file is sorted
assert utilities.isSortedBEDObject(input_file), "input file must be sorted"
# Check that regions is also sorted
assert utilities.isSortedBEDObject(regions_bed), "regions BED file must be sorted"
# Check libraryFactor
assert libraryFactor > 0, "--libraryFactor should be a positive number"
# Check lengthFactor
if lengthFactor is not None:
assert lengthFactor > 0, "--lengthFactor should be a positive number"
def normalize(input_file, regions_bed, libraryFactor, lengthFactor):
"""Core normalization method
Parameters
----------
input_file: BedTool object
BedTool object (instantiated from pybedtools) for input data
regions_bed: BedTool object
BedTool object (instantiated from pybedtools) for regions over which we are normalizing input_file
libraryFactor: float
Scalar to normalize by input_file's library size.
lengthFactor: float or None
Scalar to normalize by each block's length. If None, no length normalization is performed.
Returns
-------
bedgraph: BedTool
A BedTool object in bedGraph format, using the intervals supplied in regions_bed
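
Examples
--------
Illustrative only; the file paths and scaling factors are placeholders,
and the result is written to disk with pybedtools' ``saveas``.

>>> from pybedtools import BedTool
>>> events = BedTool("events.sorted.bed")    # hypothetical, sorted input
>>> regions = BedTool("regions.sorted.bed")  # hypothetical, sorted regions
>>> bedgraph = normalize(events, regions, libraryFactor=1e6, lengthFactor=1000)
>>> _ = bedgraph.saveas("events.normalized.bedgraph")  # hypothetical output path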
"""
# input_file and regions_bed are BedTool objects
# Validate normalization arguments
validateNormalizationArguments(input_file, regions_bed, libraryFactor, lengthFactor)
# Calculate the library scaling constant, which is the total number
# of events in the input BED divided by the library factor
library_scaling_constant = len(input_file.to_dataframe()) / libraryFactor
# For each interval in regions, count the number of overlapping events,
# then normalize the count by library_scaling_constant.
# The trailing .iloc call should accommodate region BED files with arbitrary numbers of fields
intersect_df = regions_bed.intersect(input_file, c=True, sorted=True).to_dataframe().iloc[:, [0, 1, 2, -1]]
intersect_df.columns = ["chrom", "start", "end", "rawCount"]
intersect_df["normCount"] = intersect_df["rawCount"] / library_scaling_constant
# If lengthFactor has been provided, calculate normalized rates of events
# and return them as a BedTool object; otherwise, return the normalized counts
if lengthFactor is not None:
    intersect_df["normRate"] = intersect_df["normCount"] / (
        (intersect_df["end"] - intersect_df["start"]) / lengthFactor
    )
    return BedTool.from_dataframe(
        intersect_df[["chrom", "start", "end", "normRate"]]
    )
else:
    return BedTool.from_dataframe(
        intersect_df[["chrom", "start", "end", "normCount"]]
    )
def normalize_from_command_line(args):
"""Wrapper function for the command line function ``blockify normalize``
Parameters
----------
args: ``argparse.Namespace`` object
Input from command line
Returns
-------
bedgraph: BedTool
Normalized command line data in bedGraph format
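
Examples
--------
A sketch of programmatic use with a hand-built ``argparse.Namespace``; the
attribute names mirror the ones this function reads, and the file paths are
hypothetical. When ``regions`` is supplied, the segmentation options are not used.

>>> from argparse import Namespace
>>> args = Namespace(
...     input="events.sorted.bed",      # hypothetical, sorted input
...     regions="regions.sorted.bed",   # hypothetical, sorted regions
...     method=None, p0=None, prior=None,
...     libraryFactor=1e6, lengthFactor=1000,
... )
>>> bedgraph = normalize_from_command_line(args)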
"""
input_file = BedTool(args.input)
# If regions has been supplied, use it;
if args.regions:
regions_bed = BedTool(args.regions)
# otherwise, segment the file
else:
region_segmentation = segmentation.segment(
input_file, args.method, p0=args.p0, prior=args.prior
)
regions_bed = BedTool.from_dataframe(region_segmentation.df)
return normalize(input_file, regions_bed, args.libraryFactor, args.lengthFactor)