# Source code for blockify.downsampling

import numpy as np
import pandas as pd
from pybedtools import BedTool

def downsample(input_file, n, seed=None, naive=False):
    """Core downsampling method

    Draws ``n`` distinct rows from ``input_file`` (sampling without
    replacement) and returns them in their original row order.

    Parameters
    ----------
    input_file: ``pandas`` DataFrame
        Input data (e.g. BED, qBED, CCF) as a ``pandas`` DataFrame
    n: int
        Number of entries to sample
    seed: int
        Seed for random number generator
    naive: bool
        Choose whether to sample each entry with equal probability (True)
        or weighted by the value in the fourth column (if supplied).
        When the fourth column (label ``3``) is absent, every entry is
        sampled with equal probability regardless of this flag.

    Returns
    -------
    downsampled_file: BedTool object
        Input file after downsampling
    """
    # Only read the weight column when it is actually needed and present.
    # Reading it unconditionally made naive sampling fail with a KeyError on
    # tables that have no fourth column (e.g. a plain 3-column BED file).
    if naive or 3 not in input_file.columns:
        # Select rows with equal likelihood
        p = None
    else:
        # Select rows with probability proportional to the fourth column
        weights = input_file[3]
        p = weights / np.sum(weights)

    if seed is not None:
        # NOTE: this reseeds NumPy's global random state, as the original did
        np.random.seed(seed)

    # Sample row positions without replacement, then restore file order
    indexes = np.random.choice(
        np.arange(len(input_file)), size=n, replace=False, p=p
    )
    indexes.sort()
    downsampled_file = input_file.iloc[indexes]
    return BedTool.from_dataframe(downsampled_file)
# Downsample a qBED file from the command line
# Thin wrapper for downsample()
def downsample_from_command_line(args):
    """Wrapper function for the command line function ``blockify downsample``

    Parameters
    ----------
    args: ``argparse.Namespace`` object
        Input from command line; the attributes ``input``, ``number``,
        ``seed`` and ``naive`` are read

    Returns
    -------
    downsampled_file: BedTool
        Downsampled command line data
    """
    # header=None: the first line of the file is data, not column names,
    # so columns get integer labels
    records = pd.read_table(args.input, header=None)
    sampling_options = {"seed": args.seed, "naive": args.naive}
    # Downsample the loaded table
    return downsample(records, args.number, **sampling_options)