Source code for blockify.annotation

import numpy as np
import pandas as pd
from pybedtools import BedTool
from . import segmentation
import scipy.stats as stats
import statsmodels.stats.multitest as multitest
import sys
from . import utilities
import warnings

# Suppress certain warnings
warnings.simplefilter("ignore", category=FutureWarning)
warnings.simplefilter("ignore", category=ResourceWarning)

# Sub-list collapsing method from https://stackoverflow.com/a/952952
CORRECTION_METHODS = [method for sublist in multitest._alias_list for method in sublist]
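
# Illustrative check (a sketch; this relies on statsmodels' internal alias
# table, so exact contents may vary by version). Flattening the sub-lists means
# both short and long method spellings validate:
#
#   >>> "bonferroni" in CORRECTION_METHODS
#   True
#   >>> "fdr_bh" in CORRECTION_METHODS
#   True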


def validateAnnotationArguments(
    input_file,
    regions_bed,
    background_file,
    measure,
    alpha,
    correction,
    p_value,
    distance,
    min_size,
    max_size,
    pseudocount,
):
    """Validates parameters passed via the command line.

    Parameters
    ----------
    input_file: BedTool object
        BedTool object (instantiated from pybedtools) for input data
    regions_bed: BedTool object
        BedTool object (instantiated from pybedtools) for regions over which
        we are annotating/calling peaks
    background_file: BedTool object
        BedTool object (instantiated from pybedtools) used to parameterize the
        background model
    measure: str
        Either "enrichment" or "depletion" to indicate which direction of
        effect to test for
    alpha: float or None
        Multiple-hypothesis-adjusted threshold for calling significance
    correction: str or None
        Multiple hypothesis correction to perform (see
        ``statsmodels.stats.multitest`` for valid values)
    p_value: float or None
        Straight (unadjusted) p-value cutoff for calling significance
    distance: int or None
        Merge significant features within the specified distance cutoff
    min_size: int or None
        Minimum size cutoff for peaks
    max_size: int or None
        Maximum size cutoff for peaks
    pseudocount: float
        Pseudocount added to adjust the background model

    Returns
    -------
    None: None
    """

    # Check that the input file is sorted
    assert utilities.isSortedBEDObject(input_file), "input file must be sorted"
    # Check that regions_bed is sorted
    assert utilities.isSortedBEDObject(regions_bed), "regions BED file must be sorted"
    # Check that background_file is sorted
    assert utilities.isSortedBEDObject(
        background_file
    ), "background file must be sorted"
    # Check that measure is a valid parameter
    assert measure in [
        "enrichment",
        "depletion",
    ], "measure must be either 'enrichment' or 'depletion'"
    # If regions have been supplied, check that they are also sorted
    if regions_bed:
        assert utilities.isSortedBEDFile(regions_bed), "regions BED file must be sorted"
    # Check that alpha or p_value, if provided, are valid
    if alpha:
        assert 0 <= alpha <= 1, "--alpha should be between 0 and 1, inclusive"
        # If alpha was provided, correction should have been provided too;
        # correction can be any valid method in statsmodels.stats.multitest
        assert (
            correction in CORRECTION_METHODS
        ), "invalid multiple hypothesis correction method"
    if p_value:
        assert 0 <= p_value <= 1, "--pValueCutoff should be between 0 and 1, inclusive"
    # Check that the other integers, if provided, are valid
    if distance:
        assert distance >= 0, "--distance should be a non-negative integer"
    if min_size:
        assert min_size >= 0, "--min should be a non-negative integer"
    if max_size:
        assert max_size >= 0, "--max should be a non-negative integer"
    # Check that pseudocount is a non-negative real number
    assert pseudocount >= 0, "--pseudocount should be non-negative"
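
# Illustrative use (a sketch; the file names are hypothetical). All three BED
# inputs must be coordinate-sorted BedTool objects, and alpha and correction go
# together:
#
#   >>> validateAnnotationArguments(
#   ...     BedTool("events.bed"), BedTool("regions.bed"), BedTool("background.bed"),
#   ...     measure="enrichment", alpha=0.05, correction="fdr_bh", p_value=None,
#   ...     distance=None, min_size=None, max_size=None, pseudocount=1,
#   ... )  # returns None; raises AssertionError if any argument is invalid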

def tighten(data):
    """Tightens block boundaries in a BedTool file.

    This function modifies block boundaries so that they coincide with data
    points.

    Parameters
    ----------
    data: BedTool object
        Input file of block boundaries

    Returns
    -------
    refined: BedTool object
        BedTool of tightened blocks
    """

    # data is a BedTool; the return value is also a BedTool.
    # Calculate new boundaries based on the first and last event locations in
    # each region
    df = data.to_dataframe()
    df = df.iloc[:, :6]
    df = df.rename(
        index=str,
        columns={"name": "TTAA_chrom", "score": "TTAA_start", "strand": "TTAA_end"},
    )
    df = df.sort_values(
        ["chrom", "start", "end", "TTAA_chrom", "TTAA_start", "TTAA_end"]
    )
    groups = df.groupby(["chrom", "start", "end"])
    first = groups.nth(0)["TTAA_start"]
    last = groups.nth(-1)["TTAA_end"]
    joined = pd.concat([first, last], axis=1).reset_index()
    refined = joined[["chrom", "TTAA_start", "TTAA_end"]]
    return BedTool.from_dataframe(refined)
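
# Illustrative use (a sketch; file names are hypothetical). tighten() expects
# regions pre-intersected with the data points they contain (wa=True, wb=True),
# so that columns 4-6 hold the event coordinates, mirroring its use in
# annotate() below:
#
#   >>> regions = BedTool("regions.bed")
#   >>> events = BedTool("events.bed")
#   >>> hits = regions.intersect(events, wa=True, wb=True, sorted=True)
#   >>> tightened = tighten(hits)  # block edges now coincide with outermost events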

def parcelConsecutiveBlocks(df):
    """Concatenates consecutive blocks into a DataFrame.

    If there are multiple non-contiguous sets of consecutive blocks, creates
    one DataFrame per set.

    Parameters
    ----------
    df: ``pandas`` DataFrame
        Input set of blocks as a DataFrame

    Returns
    -------
    outlist: list of ``pandas`` DataFrames
        List of DataFrames, each of which is a set of consecutive blocks
    """

    # df is a pandas DataFrame; the return value is a list of DataFrames,
    # each of which contains consecutive blocks
    outlist = []
    chroms = np.unique(df.chrom)
    for chrom in chroms:
        chrom_rows = df[df.chrom == chrom].reset_index()
        for i, row in chrom_rows.iterrows():
            i = int(i)
            if i == 0:
                # Start of a new chromosome
                outlist.append([row])
            else:
                # Check whether this block is adjacent to the previous block
                if chrom_rows.iloc[i].start == chrom_rows.iloc[i - 1].end:
                    # If so, append to the previous list
                    outlist[-1].append(row)
                else:
                    # If not, create a new list
                    outlist.append([row])
    return [pd.DataFrame(_) for _ in outlist]
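
# Minimal sketch of the parceling behavior (coordinates are illustrative):
#
#   >>> blocks = pd.DataFrame({
#   ...     "chrom": ["chr1", "chr1", "chr1"],
#   ...     "start": [0, 100, 300],
#   ...     "end": [100, 200, 400],
#   ... })
#   >>> parcels = parcelConsecutiveBlocks(blocks)
#   >>> len(parcels)  # [0, 200) is one contiguous run; [300, 400) starts another
#   2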

def getPeakSummits(df, metric="pValue"):
    """From a list of peaks, gets the set of peak summits.

    Parameters
    ----------
    df: ``pandas`` DataFrame
        Set of peaks from ``annotate`` as a DataFrame
    metric: str
        Metric to use when filtering for summits. One of "pValue" or "density"

    Returns
    -------
    summits: ``pandas`` DataFrame
        Set of peak summits as a DataFrame
    """

    # df is a pandas DataFrame; the return value is also a DataFrame
    assert metric in ["pValue", "density"]
    if metric == "pValue":
        column = "negLog10corrected"
    elif metric == "density":
        column = "Net_density"
    else:
        # Something unexpected happened
        print("Unexpected error:", sys.exc_info()[0])
        raise
    return (
        pd.concat(
            [_[_[column] == _[column].max()] for _ in parcelConsecutiveBlocks(df)]
        )
        .reset_index()
        .drop(columns=["level_0", "index"])
    )
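
# Illustrative use (a sketch; peaks_df is a hypothetical DataFrame produced by
# annotate() with multiple hypothesis correction, so it carries the
# "negLog10corrected" and "Net_density" columns used below):
#
#   >>> summits = getPeakSummits(peaks_df)                    # highest -log10(q)
#   >>> summits = getPeakSummits(peaks_df, metric="density")  # highest Net_density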

def sizeFilter(bed, min_size, max_size):
    """Filters peaks by size.

    Parameters
    ----------
    bed: BedTool object
        Input data file
    min_size: int
        Lower bound for peak size (inclusive)
    max_size: int
        Upper bound for peak size (inclusive)

    Returns
    -------
    filtered_peaks: BedTool object
        Peaks after size selection
    """

    # To filter by size, convert to a DataFrame, filter, then convert back to BED
    df = bed.to_dataframe()
    df["size"] = df["end"] - df["start"]
    df = df[df["size"] <= max_size]
    df = df[df["size"] >= min_size]
    # Also set the name, score, and strand columns
    df["name"] = "peak_" + df.index.astype(str)
    df["score"] = 1
    df["strand"] = "."
    return BedTool.from_dataframe(
        df[["chrom", "start", "end", "name", "score", "strand"]]
    )
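
# Illustrative use (a sketch; peaks_bed and the bounds are hypothetical): keep
# only peaks between 100 bp and 10 kb, inclusive:
#
#   >>> filtered = sizeFilter(peaks_bed, min_size=100, max_size=10000)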

def annotate(
    input_file,
    regions_bed,
    background_file,
    measure="enrichment",
    intermediate=None,
    alpha=None,
    correction=None,
    p_value=None,
    distance=None,
    min_size=None,
    max_size=None,
    pseudocount=1,
    tight=False,
    summit=False,
):
    """Core annotation and peak calling method.

    Parameters
    ----------
    input_file: BedTool object
        BedTool object (instantiated from pybedtools) for input data
    regions_bed: BedTool object
        BedTool object (instantiated from pybedtools) for regions over which
        we are annotating/calling peaks
    background_file: BedTool object
        BedTool object (instantiated from pybedtools) used to parameterize the
        background model
    measure: str
        Either "enrichment" or "depletion" to indicate which direction of
        effect to test for
    intermediate: bool
        Whether or not to return intermediate calculations during peak calling
    alpha: float or None
        Multiple-hypothesis-adjusted threshold for calling significance
    correction: str or None
        Multiple hypothesis correction to perform (see
        ``statsmodels.stats.multitest`` for valid values)
    p_value: float or None
        Straight (unadjusted) p-value cutoff for calling significance
    distance: int or None
        Merge significant features within the specified distance cutoff
    min_size: int or None
        Minimum size cutoff for peaks
    max_size: int or None
        Maximum size cutoff for peaks
    pseudocount: float
        Pseudocount added to adjust the background model
    tight: bool
        Whether to tighten the regions in ``regions_bed``
    summit: bool
        Whether to return peak summits instead of full peaks

    Returns
    -------
    out_bed: BedTool object
        Set of peaks in BED6 format
    df: ``pandas`` DataFrame or None
        If ``intermediate`` is specified, DataFrame containing intermediate
        calculations during peak calling
    """

    # input_file, regions_bed, and background_file are BedTool objects
    # Validate annotation arguments
    validateAnnotationArguments(
        input_file,
        regions_bed,
        background_file,
        measure,
        alpha,
        correction,
        p_value,
        distance,
        min_size,
        max_size,
        pseudocount,
    )

    # Calculate the scaling factor
    scalingFactor = len(input_file.to_dataframe()) / len(
        background_file.to_dataframe()
    )

    # Pull region edges to the nearest event in the input, if specified
    if tight:
        data = regions_bed.intersect(input_file, wa=True, wb=True, sorted=True)
        regions_bed = tighten(data)

    # Intersect regions with the input and background files
    data = regions_bed.intersect(input_file, c=True, sorted=True).intersect(
        background_file, c=True, sorted=True
    )

    # Convert to a DataFrame
    df = data.to_dataframe()
    df = df.rename(
        index=str, columns={df.columns[-2]: "Input", df.columns[-1]: "Background"}
    )

    # Calculate the normalized background (Normed_bg) number of events by
    # multiplying Background by scalingFactor, then adding the pseudocount.
    # Add the floor of the pseudocount to Input. This preserves the log-fold
    # change when Background is 0 and keeps the value added to Input an
    # integer (in case pseudocount is a float).
    df["Input"] += np.floor(pseudocount)
    df["Normed_bg"] = df["Background"] * scalingFactor + pseudocount

    # Calculate the net density of insertions in each block
    df["Net_density"] = (df["Input"] - df["Normed_bg"]) / (df["end"] - df["start"])

    if measure == "enrichment":
        # Calculate the one-tailed Poisson p-value of observing Input or more
        # events given a rate (lambda) of Normed_bg. Use the survival function
        # (sf), which is 1 - CDF. We pass Input - 1 because, for discrete
        # distributions, 1 - cdf(x) is p(X ≥ x + 1); sf(x - 1) is thus p(X ≥ x).
        df["pValue"] = stats.poisson.sf(df["Input"] - 1, df["Normed_bg"])
    elif measure == "depletion":
        # Calculate the one-tailed Poisson p-value of observing Input or fewer
        # events given a rate (lambda) of Normed_bg. Use the cumulative
        # distribution function (cdf): cdf(x) is p(X ≤ x).
        df["pValue"] = stats.poisson.cdf(df["Input"], df["Normed_bg"])
    else:
        # Something unexpected happened
        print("Unexpected error:", sys.exc_info()[0])
        raise

    # Replace p-values of 0 with 1/FLOAT_MAX to obtain a finite -log10(pValue).
    # Restricted to the pValue column so that legitimate zeros in coordinate
    # and count columns are left untouched.
    df["pValue"] = df["pValue"].replace(to_replace=0, value=1 / utilities.FLOAT_MAX)
    df["negLog10pValue"] = -np.log10(df["pValue"])

    # If p_value has been provided, filter by it; otherwise, perform multiple
    # hypothesis correction
    if p_value:
        df["rejected"] = df["pValue"] <= p_value
        out_df = df[df["pValue"] <= p_value]
    else:
        (
            df["rejected"],
            df["corrected_pValue"],
            alphacSidak,
            alphacBonf,
        ) = multitest.multipletests(
            df["pValue"],
            alpha=alpha,
            method=correction,
            is_sorted=False,
            returnsorted=False,
        )
        df["negLog10corrected"] = -np.log10(df["corrected_pValue"])
        out_df = df[df["rejected"]]

    # Return peak summits, if specified
    if summit:
        out_df = getPeakSummits(out_df)

    # Convert out_df to out_bed
    out_bed = BedTool.from_dataframe(out_df)

    # Merge peaks within distance, if specified
    if distance is not None:
        out_bed = out_bed.merge(d=distance)

    # Filter by minimum and/or maximum size, if specified
    minSize = min_size if min_size else 0
    maxSize = max_size if max_size else np.inf
    out_bed = sizeFilter(out_bed, minSize, maxSize)

    # Return out_bed and the intermediate DataFrame, if requested
    if intermediate:
        return out_bed, df
    else:
        return out_bed, None

def annotate_from_command_line(args):
    """Wrapper function for the command line function ``blockify call``.

    Parameters
    ----------
    args: ``argparse.Namespace`` object
        Input from the command line

    Returns
    -------
    out_bed: BedTool object
        Set of peaks in BED6 format
    df: ``pandas`` DataFrame or None
        If ``intermediate`` is specified, DataFrame containing intermediate
        calculations during peak calling
    """

    input_file = BedTool(args.input)
    # If a regions file has been supplied, use it;
    if args.regions:
        regions_bed = BedTool(args.regions)
    # otherwise, segment the input file
    else:
        region_segmentation = segmentation.segment(
            input_file, args.method, p0=args.p0, prior=args.prior
        )
        regions_bed = BedTool.from_dataframe(region_segmentation.df)
    background_file = BedTool(args.background)
    return annotate(
        input_file,
        regions_bed,
        background_file,
        measure=args.measure,
        intermediate=args.intermediate,
        alpha=args.alpha,
        p_value=args.pValueCutoff,
        correction=args.correction,
        distance=args.distance,
        min_size=args.min,
        max_size=args.max,
        pseudocount=args.pseudocount,
        tight=args.tight,
        summit=args.summit,
    )
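
# Illustrative sketch: the CLI wrapper can also be driven programmatically by
# building an argparse.Namespace with the attributes it reads (values here are
# hypothetical; method/p0/prior are only needed when regions is None):
#
#   >>> import argparse
#   >>> args = argparse.Namespace(
#   ...     input="events.bed", regions="regions.bed", background="background.bed",
#   ...     measure="enrichment", intermediate=False, alpha=0.05, correction="fdr_bh",
#   ...     pValueCutoff=None, distance=None, min=None, max=None, pseudocount=1,
#   ...     tight=False, summit=False,
#   ... )
#   >>> peaks, _ = annotate_from_command_line(args)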