Source code for blockify.normalization

from pybedtools import BedTool
from . import segmentation
from . import utilities
import warnings

# Generates a normalized bedgraph of event rate for BED-formatted genomic data.
# The output file can be visualized on genome browsers.

# Suppress certain warnings
warnings.simplefilter("ignore", category=FutureWarning)
warnings.simplefilter("ignore", category=ResourceWarning)


def validateNormalizationArguments(input_file, regions_bed, libraryFactor, lengthFactor):
    """Validates parameters passed via the command line.

    Parameters
    ----------
    input_file: BedTool object
        BedTool object (instantiated from pybedtools) for input data
    regions_bed: BedTool object
        BedTool object (instantiated from pybedtools) for regions over which we are normalizing input_file
    libraryFactor: float
        Scalar to normalize by input_file's library size.
    lengthFactor: float or None
        Scalar to normalize by each block's length. If None, no length normalization is performed.

    Returns
    -------
    None: None
    """

    # Check that the input file is sorted
    assert utilities.isSortedBEDObject(input_file), "input file must be sorted"
    # Check that regions is also sorted
    assert utilities.isSortedBEDObject(regions_bed), "regions BED file must be sorted"
    # Check libraryFactor
    assert libraryFactor > 0, "--libraryFactor should be a positive number"
    # Check lengthFactor
    if lengthFactor is not None:
        assert lengthFactor > 0, "--lengthFactor should be a positive number"
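
# Illustrative usage sketch (not part of the original module): the checks above
# raise an AssertionError for unsorted inputs or non-positive factors. The file
# names below are assumptions chosen for the example.
def _example_validation_usage():  # hypothetical helper, for illustration only
    events = BedTool("events.sorted.bed")    # assumed coordinate-sorted input BED
    regions = BedTool("regions.sorted.bed")  # assumed coordinate-sorted regions BED
    # Passes silently when both files are sorted and the factors are positive;
    # a non-positive libraryFactor or lengthFactor would raise an AssertionError.
    validateNormalizationArguments(events, regions, libraryFactor=1e6, lengthFactor=None)
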
def normalize(input_file, regions_bed, libraryFactor, lengthFactor):
    """Core normalization method

    Parameters
    ----------
    input_file: BedTool object
        BedTool object (instantiated from pybedtools) for input data
    regions_bed: BedTool object
        BedTool object (instantiated from pybedtools) for regions over which we are normalizing input_file
    libraryFactor: float
        Scalar to normalize by input_file's library size.
    lengthFactor: float or None
        Scalar to normalize by each block's length. If None, no length normalization is performed.

    Returns
    -------
    bedgraph: BedTool
        A BedTool object in bedGraph format, using the intervals supplied in regions_bed
    """

    # input_file and regions_bed are BedTool objects
    # Validate normalization arguments
    validateNormalizationArguments(input_file, regions_bed, libraryFactor, lengthFactor)
    # Calculate the library scaling constant, which is the total number
    # of events in the input BED divided by the library factor
    library_scaling_constant = len(input_file.to_dataframe()) / libraryFactor
    # For each interval in regions, count the number of events, then
    # normalize the count by library_scaling_constant.
    # The final .iloc selection accommodates region BED files with arbitrary numbers of fields
    intersect_df = (
        regions_bed.intersect(input_file, c=True, sorted=True)
        .to_dataframe()
        .iloc[:, [0, 1, 2, -1]]
    )
    intersect_df.columns = ["chrom", "start", "end", "rawCount"]
    intersect_df["normCount"] = intersect_df["rawCount"] / library_scaling_constant
    # If lengthFactor has been provided, calculate normalized rates of events
    if lengthFactor:
        intersect_df["normRate"] = intersect_df["normCount"] / (
            (intersect_df["end"] - intersect_df["start"]) / lengthFactor
        )
    # Return a BedTool object
    if lengthFactor:
        return BedTool.from_dataframe(
            intersect_df[["chrom", "start", "end", "normRate"]]
        )
    else:
        return BedTool.from_dataframe(
            intersect_df[["chrom", "start", "end", "normCount"]]
        )
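
# Illustrative usage sketch (not part of the original module): normalizing a
# hypothetical sorted input BED over a hypothetical regions BED and saving the
# result as a bedGraph. The file names and factor values are assumptions.
def _example_normalize_usage():  # hypothetical helper, for illustration only
    events = BedTool("events.sorted.bed")    # assumed coordinate-sorted input BED
    regions = BedTool("regions.sorted.bed")  # assumed coordinate-sorted regions BED
    # With these assumed factors, counts are scaled per million events and
    # rates per kilobase of block length
    bedgraph = normalize(events, regions, libraryFactor=1e6, lengthFactor=1e3)
    bedgraph.saveas("events.normalized.bedgraph")
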
def normalize_from_command_line(args):
    """Wrapper function for the command line function ``blockify normalize``

    Parameters
    ----------
    args: ``argparse.Namespace`` object
        Input from command line

    Returns
    -------
    bedgraph: BedTool
        Normalized command line data in bedGraph format
    """

    input_file = BedTool(args.input)
    # If regions has been supplied, use it;
    if args.regions:
        regions_bed = BedTool(args.regions)
    # otherwise, segment the file
    else:
        region_segmentation = segmentation.segment(
            input_file, args.method, p0=args.p0, prior=args.prior
        )
        regions_bed = BedTool.from_dataframe(region_segmentation.df)
    return normalize(input_file, regions_bed, args.libraryFactor, args.lengthFactor)
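
# Illustrative sketch (not part of the original module): calling the command-line
# wrapper programmatically with an argparse.Namespace. The attribute values shown
# are assumptions; in real use they come from blockify's own argument parser.
def _example_command_line_usage():  # hypothetical helper, for illustration only
    import argparse

    args = argparse.Namespace(
        input="events.sorted.bed",     # assumed coordinate-sorted input BED
        regions="regions.sorted.bed",  # if None, the input is segmented instead
        method=None,                   # segmentation options, only read when regions is None
        p0=None,
        prior=None,
        libraryFactor=1e6,             # assumed scaling factors
        lengthFactor=1e3,
    )
    return normalize_from_command_line(args)
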