Source code for de_toolkit.outlier

r'''
Usage:
    detk-outlier entropy <counts_fn> [options]
    detk-outlier shrink [options] <counts_fn>
'''
TODO = '''
    detk-outlier trim [options] <counts_fn>
'''

cmd_opts = {
    'entropy':r'''
Usage:
    detk-outlier entropy <counts_fn> [options]

Options:
    -p P --percentile=P    Lower percentile below which to flag genes, a float between 0 and 1
    -o FILE --output=FILE  Name of the output csv [default: stdout]
    --plot-output=FILE     Name of the plot png
''',
    'shrink':r'''
Usage:
    detk-outlier shrink [options] <counts_fn>

Options:
    -o FILE --output=FILE   Destination of primary output [default: stdout]
    -f N --shrink-factor=N  Shrinkage factor, a float between 0 and 1 [default: 0.25]
    -p N --p-max=N          Maximum proportion of counts a sample may have; default is sqrt(1/num samples)
    -i N --iters=N          Number of iterations [default: 1000]
''',
}
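
# Illustrative invocations of the commands documented above (file names
# hypothetical; common options such as --report-dir are appended to each
# docopt string by _cli_doc below):
#
#   detk-outlier entropy counts.csv --percentile=0.05 \
#       -o entropy.csv --plot-output=entropy.png
#   detk-outlier shrink counts.csv --shrink-factor=0.25 --iters=1000 \
#       -o counts_shrunk.csv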

import csv, os
from docopt import docopt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as sc
import sys

from .common import CountMatrixFile, DetkModule, _cli_doc
from .report import DetkReport
from .util import stub

def pmf_transform(count_obj, shrink_factor=0.25, p_max=None, iters=1000):
    '''Shrink outlier values in a single feature's counts (a pandas
    Series); see :func:`shrink` for a description of the algorithm.'''
    obj = PMFTransform(count_obj, shrink_factor, p_max, iters)
    return obj.output
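
# A worked trace of pmf_transform on made-up numbers, assuming the counts
# of one feature across five samples:
#
#   >>> import pandas as pd
#   >>> pmf_transform(pd.Series([5., 6., 7., 8., 100.]))
#
# p_max defaults to sqrt(1/5) ~= 0.447. In the first iteration the last
# sample holds 100/126 ~= 0.794 of the counts, so it is shrunk toward the
# largest non-outlier count (8): 8 + (100-8)*0.25 = 31. It still holds
# 31/57 ~= 0.544 > p_max, so a second iteration gives 8 + (31-8)*0.25 =
# 13.75, whose proportion 13.75/39.75 ~= 0.346 is below p_max and the
# iteration stops. Note the rank order of the samples is preserved.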

class PMFTransform(DetkModule):
    def __init__(self, count_obj, shrink_factor=0.25, p_max=None, iters=1000):
        self['params'] = {
                'shrink_factor': shrink_factor,
                'p_max': p_max,
                'iters': iters
                }
        count_obj = count_obj.copy()
        p_max = p_max or np.sqrt(1./len(count_obj))

        for i in range(iters) :
            # guard against all-zero counts before dividing
            if count_obj.sum() == 0 :
                print('all samples set to zero, returning')
                break

            p_count = count_obj/count_obj.sum()

            p_count_outliers = p_count>p_max

            if not p_count_outliers.any() :
                break # done

            max_non_outliers = max(count_obj[~p_count_outliers])

            count_obj[p_count_outliers] = max_non_outliers + \
                    (count_obj[p_count_outliers]-max_non_outliers)*shrink_factor
        else :
            # the loop exhausted all iterations without converging
            print('PMF transform did not converge after {} iterations'.format(iters))
        
        self.count_obj = count_obj

    @property
    def output(self):
        return self.count_obj

    @property
    def properties(self):
        return {'num_kept':len(self.count_obj)}

def shrink(count_obj, shrink_factor=0.25, p_max=None, iters=1000) :
    '''Outlier count shrinkage routine as described in Labadorf et al,
    PLOS ONE (2015)

    This algorithm identifies features where a small number of samples
    contains a disproportionately large share of the overall counts for
    that feature across samples. For each feature the algorithm is as
    follows:

    1. Divide each sample count by the sum of counts (i.e. sample count
       proportions)
    2. Identify samples that have >*p_max* sample count proportion

       a. If no samples are identified, return the most recent set of
          adjusted counts
       b. Else, shrink each identified outlier count *o* toward the
          largest sample count *s* for which P(x)<*p_max* by multiplying
          the difference between *o* and *s* by the shrinkage factor and
          replacing *o* with *s* plus the shrunken difference

    3. Go to 1, repeat until no samples exceed *p_max* count proportion

    This strategy assumes that samples with disproportionate count
    contribution are outliers: the rank order of the samples is taken to
    be correct, but the magnitude sometimes is not. The order of the
    samples is therefore always maintained, and the shrinking does not
    introduce new false positives beyond what would already be in the
    dataset. In the original publication, the maximum proportion of reads
    allowed in one sample and the shrinkage factor were both set to 0.2.

    Parameters
    ----------
    count_obj : de_toolkit.CountMatrix object
        counts object
    shrink_factor : float
        number between 0 and 1 that determines how much the residual
        counts of outlier samples are shrunk in each iteration
    p_max : float
        number between 0 and 1 that indicates the maximum proportion of
        counts a sample may have before being considered an outlier,
        default is ``sqrt(1/num_samples)``
    iters : int
        maximum number of shrinkage iterations per feature
    '''
    obj = ShrinkCounts(count_obj, shrink_factor, p_max, iters)
    return obj.output
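
# A minimal usage sketch (file names hypothetical): load a count matrix,
# shrink outlier counts, and write the adjusted matrix.
#
#   >>> from de_toolkit.common import CountMatrixFile
#   >>> counts = CountMatrixFile('counts.csv')
#   >>> shrunk = shrink(counts, shrink_factor=0.25)
#   >>> shrunk.to_csv('counts_shrunk.csv', sep='\t')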
class ShrinkCounts(DetkModule):
    def __init__(self, count_obj, shrink_factor=0.25, p_max=None, iters=1000):
        self['params'] = {
                'shrink_factor': shrink_factor,
                'p_max': p_max,
                'iters': iters
                }
        # rows are features and columns are samples, so apply the PMF
        # transform per row (axis=1)
        shrunk_counts = count_obj.counts.apply(
                pmf_transform,
                axis=1,
                shrink_factor=shrink_factor,
                p_max=p_max,
                iters=iters
                )
        shrunk_counts = pd.DataFrame(
                shrunk_counts,
                index=count_obj.counts.index,
                columns=count_obj.counts.columns
                )
        self.shrunk_counts = shrunk_counts

    @property
    def output(self):
        return self.shrunk_counts

    @property
    def properties(self):
        return {'num_kept': len(self.shrunk_counts)}

@stub
def trim(count_obj) :
    '''
    possibly implement some trimmed-mean trimming
    '''
    pass
def entropy(counts_obj, threshold):
    '''Calculate sample entropy for each gene and flag genes that fall
    below the given lower percentile threshold

    Sample entropy is a metric that can be used to identify outlier
    samples by locating rows which are overly influenced by a single
    count value. This metric is calculated for each gene/feature *g* as
    follows::

        p_i = c_i/sum_j(c_j), such that sum_i(p_i) = 1
        H_g = -sum_i(p_i*log2(p_i))

    Here, c_i is the number of counts in sample i, p_i is the fraction of
    reads contributed by sample i to the overall counts of the row, and
    H_g is the Shannon entropy of the row when using log2. The maximum
    possible value of H_g is log2(n) for n samples, attained when every
    sample contributes an equal share of the counts. Genes/features with
    very low entropy are those where a small number of samples makes up
    most of the counts across all samples.

    Parameters
    ----------
    counts_obj : de_toolkit.CountMatrix
        count matrix object
    threshold : float
        the lower percentile below which to flag genes, as a fraction
        between 0 and 1

    Returns
    -------
    pandas.DataFrame
        data frame with one row for each row in the input counts matrix
        and two columns:

        - *entropy*: the calculated entropy value for that row
        - *entropy_p0_XX*: a True/False column for genes flagged as
          having an entropy value less than the 0.XX percentile; *XX* is
          the fractional part of the selected threshold
    '''
    obj = EntropyCounts(counts_obj, threshold)
    return obj.output
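
# An illustration of the metric with scipy.stats.entropy (values made
# up): a feature with evenly distributed counts attains the maximum
# entropy log2(n), while a feature dominated by one sample scores much
# lower. scipy normalizes the counts to proportions internally.
#
#   >>> import scipy.stats as sc
#   >>> sc.entropy([10, 10, 10, 10], base=2)  # log2(4) = 2.0
#   >>> sc.entropy([97, 1, 1, 1], base=2)     # ~= 0.242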
class EntropyCounts(DetkModule):
    def __init__(self, counts_obj, threshold):
        self['params'] = {'threshold': threshold}
        self.counts_obj = counts_obj
        counts_transpose = counts_obj.counts.copy().transpose()

        trshld_name = str(threshold).split('.')[1]

        # drop features whose counts sum to zero, since entropy is
        # undefined for them
        all_features = counts_transpose.columns.tolist()
        counts_transpose = counts_transpose.loc[:, (counts_transpose != 0).any(axis=0)]
        nonzero_features = counts_transpose.columns.tolist()
        dropped_features = set(all_features) - set(nonzero_features)

        # create a null (NaN-filled) results df for all of the dropped features
        dropped_df = pd.DataFrame(
                columns=['entropy', 'entropy_p0_{}'.format(trshld_name)],
                index=dropped_features
                )

        # calculate the entropy over all of the features, using log2 as
        # described in the entropy() docstring
        entropy = counts_transpose.apply(lambda c: sc.entropy(c, base=2), axis=0)

        # identify the entropy value at the requested lower percentile
        # (threshold is a fraction, np.percentile expects 0-100)
        entropy_threshold = np.percentile(entropy, q=threshold*100)

        # create the results of the entropy test
        # column 1 is the entropy value
        # column 2 is a boolean indicating whether the value is under the
        # user-specified threshold
        results_df = pd.DataFrame(entropy, columns=['entropy'])
        results_df['entropy_p0_{}'.format(trshld_name)] = entropy < entropy_threshold

        results_df = pd.concat([results_df, dropped_df])

        # reorder the results to match the counts index
        results_df = results_df.reindex(counts_obj.counts.index)

        self.results_df = results_df

    @property
    def output(self):
        return self.results_df

    @property
    def properties(self):
        return {'num_kept': len(self.results_df)}

def plot_entropy(entropy_res, threshold, name=None, show=None):
    '''
    Plot a log-scale histogram of the entropy values in *entropy_res*,
    marking the entropy value at the *threshold* percentile with a
    vertical line. *entropy_res* should be the data frame produced by
    :func:`entropy`, and *threshold* a float between 0 and 1. If a name
    (in the form of \\*.png) is given, the figure is saved with the
    specified name. If *show* is not None, the plot is shown
    interactively.
    '''
    entropy = entropy_res['entropy']

    # sort the entropy values in ascending order
    entropy = entropy.sort_values(ascending=True)

    # threshold is a fraction, np.percentile expects 0-100
    entropy_threshold = np.percentile(entropy, q=threshold*100)

    # plot histogram
    fig = plt.gcf()
    plt.hist(entropy, bins='auto', log=True)
    plt.axvline(entropy_threshold, color='red')
    plt.xlabel('Entropy')
    plt.ylabel('Features Per Bin')
    plt.title('Binned Feature Entropy')
    # the histogram is drawn first, the threshold line second
    plt.legend(['Data', 'P < {}'.format(threshold)])
    fig.set_size_inches(10, 10)

    if name is not None :
        fig.savefig(name, dpi=100)
    if show is not None :
        plt.show()

def main(argv=sys.argv):

    if '--version' in argv :
        from .version import __version__
        print(__version__)
        return

    # add the common opts to the docopt strings
    cmd_opts_aug = {}
    for k,v in cmd_opts.items() :
        cmd_opts_aug[k] = _cli_doc(v)

    if len(argv) < 2 or (len(argv) > 1 and argv[1] not in cmd_opts) :
        docopt(_cli_doc(__doc__),argv)

    argv = argv[1:]
    cmd = argv[0]

    if cmd == 'entropy' :
        args = docopt(cmd_opts_aug['entropy'],argv)
        count_obj = CountMatrixFile(args['<counts_fn>'])
        pval = float(args['--percentile'])

        # run the entropy calculation
        out = EntropyCounts(count_obj, pval)

        if args['--plot-output'] :
            plot_entropy(out.output, pval, name=args['--plot-output'])
    elif cmd == 'trim' :
        # trim is not yet exposed in cmd_opts (see TODO above)
        args = docopt(cmd_opts_aug['trim'],argv)
        count_obj = CountMatrixFile(args['<counts_fn>'])
        out = trim(count_obj)
    elif cmd == 'shrink' :
        args = docopt(cmd_opts_aug['shrink'],argv)
        count_obj = CountMatrixFile(args['<counts_fn>'])
        if args['--p-max'] is None:
            args['--p-max'] = np.sqrt(1./count_obj.counts.shape[1])
        out = ShrinkCounts(count_obj,
                shrink_factor=float(args['--shrink-factor']),
                p_max=float(args['--p-max']),
                iters=int(args['--iters'])
                )

    if args['--output'] == 'stdout' :
        f = sys.stdout
    else :
        f = args['--output']

    out.output.to_csv(f,sep='\t')

    # write out the report json
    with DetkReport(args['--report-dir']) as r :
        r.add_module(
                out,
                in_file_path=args['<counts_fn>'],
                out_file_path=args['--output'],
                column_data_path=args.get('--column-data'),
                workdir=os.getcwd()
                )

if __name__ == '__main__':
    main()