Source code for blockify.annotation

import numpy as np
import pandas as pd
from pybedtools import BedTool
from . import segmentation
import scipy.stats as stats
import statsmodels.stats.multitest as multitest
import sys
from . import utilities
import warnings

# Suppress certain warnings
warnings.simplefilter("ignore", category=FutureWarning)
warnings.simplefilter("ignore", category=ResourceWarning)

# Sub-list collapsing method from https://stackoverflow.com/a/952952
CORRECTION_METHODS = [method for sublist in multitest._alias_list for method in sublist]
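
# Illustrative check (a sketch; this relies on statsmodels' internal alias
# table, so exact contents may vary by version). Flattening the sub-lists means
# both short and long method spellings validate:
#
#   >>> "bonferroni" in CORRECTION_METHODS
#   True
#   >>> "fdr_bh" in CORRECTION_METHODS
#   True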


def validateAnnotationArguments(
    input_file,
    regions_bed,
    background_file,
    measure,
    alpha,
    correction,
    p_value,
    distance,
    min_size,
    max_size,
    pseudocount,
):
    """Validates parameters passed via the command line.

    Parameters
    ----------
    input_file: BedTool object
        BedTool object (instantiated from pybedtools) for input data
    regions_bed: BedTool object
        BedTool object (instantiated from pybedtools) for regions over which
        we are annotating/calling peaks
    background_file: BedTool object
        BedTool object (instantiated from pybedtools) used to parameterize the
        background model
    measure: str
        Either "enrichment" or "depletion" to indicate which direction of
        effect to test for
    alpha: float or None
        Multiple-hypothesis-adjusted threshold for calling significance
    correction: str or None
        Multiple hypothesis correction to perform (see
        ``statsmodels.stats.multitest`` for valid values)
    p_value: float or None
        Straight (unadjusted) p-value cutoff for calling significance
    distance: int or None
        Merge significant features within the specified distance cutoff
    min_size: int or None
        Minimum size cutoff for peaks
    max_size: int or None
        Maximum size cutoff for peaks
    pseudocount: float
        Pseudocount added to adjust the background model

    Returns
    -------
    None: None
    """

    # Check that the input file is sorted
    assert utilities.isSortedBEDObject(input_file), "input file must be sorted"
    # Check that regions_bed is sorted
    assert utilities.isSortedBEDObject(regions_bed), "regions BED file must be sorted"
    # Check that background_file is sorted
    assert utilities.isSortedBEDObject(
        background_file
    ), "background file must be sorted"
    # Check that measure is a valid parameter
    assert measure in [
        "enrichment",
        "depletion",
    ], "measure must be either 'enrichment' or 'depletion'"
    # If regions have been supplied, check that they are also sorted
    if regions_bed:
        assert utilities.isSortedBEDFile(regions_bed), "regions BED file must be sorted"
    # Check that alpha or p_value, if provided, are valid
    if alpha:
        assert 0 <= alpha <= 1, "--alpha should be between 0 and 1, inclusive"
        # If alpha was provided, correction should have been provided too;
        # correction can be any valid method in statsmodels.stats.multitest
        assert (
            correction in CORRECTION_METHODS
        ), "invalid multiple hypothesis correction method"
    if p_value:
        assert 0 <= p_value <= 1, "--pValueCutoff should be between 0 and 1, inclusive"
    # Check that the other integers, if provided, are valid
    if distance:
        assert distance >= 0, "--distance should be a non-negative integer"
    if min_size:
        assert min_size >= 0, "--min should be a non-negative integer"
    if max_size:
        assert max_size >= 0, "--max should be a non-negative integer"
    # Check that pseudocount is a non-negative real number
    assert pseudocount >= 0, "--pseudocount should be non-negative"
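
# Illustrative use (a sketch; the file names are hypothetical). All three BED
# inputs must be coordinate-sorted BedTool objects, and alpha and correction go
# together:
#
#   >>> validateAnnotationArguments(
#   ...     BedTool("events.bed"), BedTool("regions.bed"), BedTool("background.bed"),
#   ...     measure="enrichment", alpha=0.05, correction="fdr_bh", p_value=None,
#   ...     distance=None, min_size=None, max_size=None, pseudocount=1,
#   ... )  # returns None; raises AssertionError if any argument is invalid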

def tighten(data):
    """Tightens block boundaries in a BedTool file.

    This function modifies block boundaries so that they coincide with data
    points.

    Parameters
    ----------
    data: BedTool object
        Input file of block boundaries

    Returns
    -------
    refined: BedTool object
        BedTool of tightened blocks
    """

    # data is a BedTool; the return value is also a BedTool.
    # Calculate new boundaries based on the first and last event locations in
    # each region
    df = data.to_dataframe()
    df = df.iloc[:, :6]
    df = df.rename(
        index=str,
        columns={"name": "TTAA_chrom", "score": "TTAA_start", "strand": "TTAA_end"},
    )
    df = df.sort_values(
        ["chrom", "start", "end", "TTAA_chrom", "TTAA_start", "TTAA_end"]
    )
    groups = df.groupby(["chrom", "start", "end"])
    first = groups.nth(0)["TTAA_start"]
    last = groups.nth(-1)["TTAA_end"]
    joined = pd.concat([first, last], axis=1).reset_index()
    refined = joined[["chrom", "TTAA_start", "TTAA_end"]]
    return BedTool.from_dataframe(refined)
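
# Illustrative use (a sketch; file names are hypothetical). tighten() expects
# regions pre-intersected with the data points they contain (wa=True, wb=True),
# so that columns 4-6 hold the event coordinates, mirroring its use in
# annotate() below:
#
#   >>> regions = BedTool("regions.bed")
#   >>> events = BedTool("events.bed")
#   >>> hits = regions.intersect(events, wa=True, wb=True, sorted=True)
#   >>> tightened = tighten(hits)  # block edges now coincide with outermost events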

def parcelConsecutiveBlocks(df):
    """Concatenates consecutive blocks into a DataFrame.

    If there are multiple non-contiguous sets of consecutive blocks, creates
    one DataFrame per set.

    Parameters
    ----------
    df: ``pandas`` DataFrame
        Input set of blocks as a DataFrame

    Returns
    -------
    outlist: list of ``pandas`` DataFrames
        List of DataFrames, each of which is a set of consecutive blocks
    """

    # df is a pandas DataFrame; the return value is a list of DataFrames,
    # each of which contains consecutive blocks
    outlist = []
    chroms = np.unique(df.chrom)
    for chrom in chroms:
        chrom_rows = df[df.chrom == chrom].reset_index()
        for i, row in chrom_rows.iterrows():
            i = int(i)
            if i == 0:
                # Start of a new chromosome
                outlist.append([row])
            else:
                # Check whether this block is adjacent to the previous block
                if chrom_rows.iloc[i].start == chrom_rows.iloc[i - 1].end:
                    # If so, append to the previous list
                    outlist[-1].append(row)
                else:
                    # If not, create a new list
                    outlist.append([row])
    return [pd.DataFrame(_) for _ in outlist]
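
# Minimal sketch of the parceling behavior (coordinates are illustrative):
#
#   >>> blocks = pd.DataFrame({
#   ...     "chrom": ["chr1", "chr1", "chr1"],
#   ...     "start": [0, 100, 300],
#   ...     "end": [100, 200, 400],
#   ... })
#   >>> parcels = parcelConsecutiveBlocks(blocks)
#   >>> len(parcels)  # [0, 200) is one contiguous run; [300, 400) starts another
#   2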

def getPeakSummits(df, metric="pValue"):
    """From a list of peaks, gets the set of peak summits.

    Parameters
    ----------
    df: ``pandas`` DataFrame
        Set of peaks from ``annotate`` as a DataFrame
    metric: str
        Metric to use when filtering for summits. One of "pValue" or "density"

    Returns
    -------
    summits: ``pandas`` DataFrame
        Set of peak summits as a DataFrame
    """

    # df is a pandas DataFrame; the return value is also a DataFrame
    assert metric in ["pValue", "density"]
    if metric == "pValue":
        column = "negLog10corrected"
    elif metric == "density":
        column = "Net_density"
    else:
        # Something unexpected happened
        print("Unexpected error:", sys.exc_info()[0])
        raise
    return (
        pd.concat(
            [_[_[column] == _[column].max()] for _ in parcelConsecutiveBlocks(df)]
        )
        .reset_index()
        .drop(columns=["level_0", "index"])
    )
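
# Illustrative use (a sketch; peaks_df is a hypothetical DataFrame produced by
# annotate() with multiple hypothesis correction, so it carries the
# "negLog10corrected" and "Net_density" columns used below):
#
#   >>> summits = getPeakSummits(peaks_df)                    # highest -log10(q)
#   >>> summits = getPeakSummits(peaks_df, metric="density")  # highest Net_density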

def sizeFilter(bed, min_size, max_size):
    """Filters peaks by size.

    Parameters
    ----------
    bed: BedTool object
        Input data file
    min_size: int
        Lower bound for peak size (inclusive)
    max_size: int
        Upper bound for peak size (inclusive)

    Returns
    -------
    filtered_peaks: BedTool object
        Peaks after size selection
    """

    # To filter by size, convert to a DataFrame, filter, then convert back to BED
    df = bed.to_dataframe()
    df["size"] = df["end"] - df["start"]
    df = df[df["size"] <= max_size]
    df = df[df["size"] >= min_size]
    # Also set the name, score, and strand columns
    df["name"] = "peak_" + df.index.astype(str)
    df["score"] = 1
    df["strand"] = "."
    return BedTool.from_dataframe(
        df[["chrom", "start", "end", "name", "score", "strand"]]
    )
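
# Illustrative use (a sketch; peaks_bed and the bounds are hypothetical): keep
# only peaks between 100 bp and 10 kb, inclusive:
#
#   >>> filtered = sizeFilter(peaks_bed, min_size=100, max_size=10000)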

def annotate(
    input_file,
    regions_bed,
    background_file,
    measure="enrichment",
    intermediate=None,
    alpha=None,
    correction=None,
    p_value=None,
    distance=None,
    min_size=None,
    max_size=None,
    pseudocount=1,
    tight=False,
    summit=False,
):
    """Core annotation and peak calling method.

    Parameters
    ----------
    input_file: BedTool object
        BedTool object (instantiated from pybedtools) for input data
    regions_bed: BedTool object
        BedTool object (instantiated from pybedtools) for regions over which
        we are annotating/calling peaks
    background_file: BedTool object
        BedTool object (instantiated from pybedtools) used to parameterize the
        background model
    measure: str
        Either "enrichment" or "depletion" to indicate which direction of
        effect to test for
    intermediate: bool
        Whether or not to return intermediate calculations during peak calling
    alpha: float or None
        Multiple-hypothesis-adjusted threshold for calling significance
    correction: str or None
        Multiple hypothesis correction to perform (see
        ``statsmodels.stats.multitest`` for valid values)
    p_value: float or None
        Straight (unadjusted) p-value cutoff for calling significance
    distance: int or None
        Merge significant features within the specified distance cutoff
    min_size: int or None
        Minimum size cutoff for peaks
    max_size: int or None
        Maximum size cutoff for peaks
    pseudocount: float
        Pseudocount added to adjust the background model
    tight: bool
        Whether to tighten the regions in ``regions_bed``
    summit: bool
        Whether to return peak summits instead of full peaks

    Returns
    -------
    out_bed: BedTool object
        Set of peaks in BED6 format
    df: ``pandas`` DataFrame or None
        If ``intermediate`` is specified, DataFrame containing intermediate
        calculations during peak calling
    """

    # input_file, regions_bed, and background_file are BedTool objects
    # Validate annotation arguments
    validateAnnotationArguments(
        input_file,
        regions_bed,
        background_file,
        measure,
        alpha,
        correction,
        p_value,
        distance,
        min_size,
        max_size,
        pseudocount,
    )

    # Calculate the scaling factor
    scalingFactor = len(input_file.to_dataframe()) / len(
        background_file.to_dataframe()
    )

    # Pull region edges to the nearest event in the input, if specified
    if tight:
        data = regions_bed.intersect(input_file, wa=True, wb=True, sorted=True)
        regions_bed = tighten(data)

    # Intersect regions with the input and background files
    data = regions_bed.intersect(input_file, c=True, sorted=True).intersect(
        background_file, c=True, sorted=True
    )

    # Convert to a DataFrame
    df = data.to_dataframe()
    df = df.rename(
        index=str, columns={df.columns[-2]: "Input", df.columns[-1]: "Background"}
    )

    # Calculate the normalized background (Normed_bg) number of events by
    # multiplying Background by scalingFactor, then adding the pseudocount.
    # Add the floor of the pseudocount to Input. This preserves the log-fold
    # change when Background is 0 and keeps the value added to Input an
    # integer (in case pseudocount is a float).
    df["Input"] += np.floor(pseudocount)
    df["Normed_bg"] = df["Background"] * scalingFactor + pseudocount

    # Calculate the net density of insertions in each block
    df["Net_density"] = (df["Input"] - df["Normed_bg"]) / (df["end"] - df["start"])

    if measure == "enrichment":
        # Calculate the one-tailed Poisson p-value of observing Input or more
        # events given a rate (lambda) of Normed_bg. Use the survival function
        # (sf), which is 1 - CDF. We pass Input - 1 because, for discrete
        # distributions, 1 - cdf(x) is p(X ≥ x + 1); sf(x - 1) is thus p(X ≥ x).
        df["pValue"] = stats.poisson.sf(df["Input"] - 1, df["Normed_bg"])
    elif measure == "depletion":
        # Calculate the one-tailed Poisson p-value of observing Input or fewer
        # events given a rate (lambda) of Normed_bg. Use the cumulative
        # distribution function (cdf): cdf(x) is p(X ≤ x).
        df["pValue"] = stats.poisson.cdf(df["Input"], df["Normed_bg"])
    else:
        # Something unexpected happened
        print("Unexpected error:", sys.exc_info()[0])
        raise

    # Replace p-values of 0 with 1/FLOAT_MAX to obtain a finite -log10(pValue).
    # Restricted to the pValue column so that legitimate zeros in coordinate
    # and count columns are left untouched.
    df["pValue"] = df["pValue"].replace(to_replace=0, value=1 / utilities.FLOAT_MAX)
    df["negLog10pValue"] = -np.log10(df["pValue"])

    # If p_value has been provided, filter by it; otherwise, perform multiple
    # hypothesis correction
    if p_value:
        df["rejected"] = df["pValue"] <= p_value
        out_df = df[df["pValue"] <= p_value]
    else:
        (
            df["rejected"],
            df["corrected_pValue"],
            alphacSidak,
            alphacBonf,
        ) = multitest.multipletests(
            df["pValue"],
            alpha=alpha,
            method=correction,
            is_sorted=False,
            returnsorted=False,
        )
        df["negLog10corrected"] = -np.log10(df["corrected_pValue"])
        out_df = df[df["rejected"]]

    # Return peak summits, if specified
    if summit:
        out_df = getPeakSummits(out_df)

    # Convert out_df to out_bed
    out_bed = BedTool.from_dataframe(out_df)

    # Merge peaks within distance, if specified
    if distance is not None:
        out_bed = out_bed.merge(d=distance)

    # Filter by minimum and/or maximum size, if specified
    minSize = min_size if min_size else 0
    maxSize = max_size if max_size else np.inf
    out_bed = sizeFilter(out_bed, minSize, maxSize)

    # Return out_bed and the intermediate DataFrame, if requested
    if intermediate:
        return out_bed, df
    else:
        return out_bed, None

def annotate_from_command_line(args):
    """Wrapper function for the command line function ``blockify call``.

    Parameters
    ----------
    args: ``argparse.Namespace`` object
        Input from the command line

    Returns
    -------
    out_bed: BedTool object
        Set of peaks in BED6 format
    df: ``pandas`` DataFrame or None
        If ``intermediate`` is specified, DataFrame containing intermediate
        calculations during peak calling
    """

    input_file = BedTool(args.input)
    # If a regions file has been supplied, use it;
    if args.regions:
        regions_bed = BedTool(args.regions)
    # otherwise, segment the input file
    else:
        region_segmentation = segmentation.segment(
            input_file, args.method, p0=args.p0, prior=args.prior
        )
        regions_bed = BedTool.from_dataframe(region_segmentation.df)
    background_file = BedTool(args.background)
    return annotate(
        input_file,
        regions_bed,
        background_file,
        measure=args.measure,
        intermediate=args.intermediate,
        alpha=args.alpha,
        p_value=args.pValueCutoff,
        correction=args.correction,
        distance=args.distance,
        min_size=args.min,
        max_size=args.max,
        pseudocount=args.pseudocount,
        tight=args.tight,
        summit=args.summit,
    )
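
# Illustrative sketch: the CLI wrapper can also be driven programmatically by
# building an argparse.Namespace with the attributes it reads (values here are
# hypothetical; method/p0/prior are only needed when regions is None):
#
#   >>> import argparse
#   >>> args = argparse.Namespace(
#   ...     input="events.bed", regions="regions.bed", background="background.bed",
#   ...     measure="enrichment", intermediate=False, alpha=0.05, correction="fdr_bh",
#   ...     pValueCutoff=None, distance=None, min=None, max=None, pseudocount=1,
#   ...     tight=False, summit=False,
#   ... )
#   >>> peaks, _ = annotate_from_command_line(args)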