r'''
Usage:
detk-outlier entropy <counts_fn> [options]
detk-outlier shrink [options] <counts_fn>
'''
TODO = '''
detk-outlier trim [options] <counts_fn>
'''
cmd_opts = {
'entropy':r'''
Usage:
detk-outlier entropy <counts_fn> [options]
Options:
-p P --percentile=P Float value between 0 and 1
-o FILE --output=FILE Name of the ouput csv
--plot-output=FILE Name of the plot png
''',
'shrink':r'''
Usage:
detk-outlier shrink [options] <counts_fn>
Options:
-o FILE --output=FILE Destination of primary output [default: stdout]
-f N --shrink-factor=N Shrinkage factor number float between 0 and 1 [default: 0.25]
-p N --p-max=N Percent counts of sample default is sqrt(1/num samples)
-i N --iters=N Number of terations [default: 1000]
''',
}
import csv, os
from docopt import docopt
import matplotlib as plt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as sc
import sys
from .common import CountMatrixFile, DetkModule, _cli_doc
from .report import DetkReport
from .util import stub
def pmf_transform(count_obj, shrink_factor=0.25, p_max=None, iters=1000):
obj = PMFTransform(count_obj, shrink_factor, p_max, iters)
return obj.output
class PMFTransform(DetkModule):
def __init__(self, count_obj, shrink_factor=0.25, p_max=None, iters=1000):
self['params'] = {
'shrink_factor': shrink_factor,
'p_max': p_max,
'iters': iters
}
count_obj = count_obj.copy()
p_max = p_max or np.sqrt(1./len(count_obj))
for i in range(iters) :
p_count = count_obj/count_obj.sum()
if count_obj.sum() == 0 :
print('all samples set to zero, returning')
break
p_count_outliers = p_count>p_max
if not p_count_outliers.any() :
break # done
max_non_outliers = max(count_obj[~p_count_outliers])
count_obj[p_count_outliers] = max_non_outliers+(count_obj[p_count_outliers]-max_non_outliers)*shrink_factor
# if i == iters :
# print('PMF transform did not converge')
# print(p_x)
# print(p_x_outliers)
self.count_obj = count_obj
@property
def output(self):
return self.count_obj
@property
def properties(self):
return {'num_kept':len(self.count_obj)}
[docs]def shrink(count_obj, shrink_factor=0.25, p_max=None, iters=1000) :
'''
Outlier count shrinkage routine as described in Labadorf et al, PLOSONE (2015)
This algorithm identifies feature where a small number of samples contains a
disproportionately large number of the overall counts for that feature
across samples. For each feature the algorithm is as follows:
1. Divide each sample count by the sum of counts (i.e. sample count
proportions)
2. Identify samples that have >*p_max* sample count proportion
a. If no samples are identified, return the most recent set of adjusted
counts
b. Else, shrink the identified samples toward the largest sample *s* for
which P(x)<*p_max* by multiplying the difference between the
outlier sample and *s* by the shrinkage factor and replacing
*o* with *s* the shrunken count value
3. Go to 1, repeat until no samples exceed *p_max* count proportion
This strategy assumes that samples with disproportionate count contribution
are outliers and that the order of samples is correct and the magnitude is
sometimes not. The order of the samples is thus always maintained, and the
shrinking does not introduce new false positives beyond what would already
be in the dataset. The maximum proportion of reads allowed in one sample,
p, and the shrinkage factor were both set to 0.2.
Parameters
----------
count_obj: de_toolkit.CountMatrix object
counts object
shrink_factor: float
number between 0 and 1 that determines how much the residual counts of
outlier samples is shrunk in each iteration
p_max: float
number between 0 and 1 that indicates the maximum proportion of counts
a sample may have before being considered an outlier, default is
``sqrt(1/num_samples)``
'''
obj = ShrinkCounts(count_obj, shrink_factor, p_max, iters)
return obj.output
class ShrinkCounts(DetkModule):
def __init__(self, count_obj, shrink_factor=0.25, p_max=None, iters=1000):
self['params'] = {
'shrink_factor': shrink_factor,
'p_max': p_max,
'iters': iters
}
shrunk_counts = count_obj.counts.apply(
pmf_transform,
shrink_factor=shrink_factor,
p_max=p_max,
iters=iters
)
shrunk_counts = pd.DataFrame(
shrunk_counts,
index=count_obj.counts.index,
columns=count_obj.counts.columns
)
self.shrunk_counts = shrunk_counts
@property
def output(self):
return self.shrunk_counts
@property
def properties(self):
return {'num_kept': len(self.shrunk_counts)}
@stub
def trim(count_obj) :
'''
possibly implement some trimmed-mean trimming
'''
pass
[docs]def entropy(counts_obj, threshold):
'''
Calculate sample entropy for each gene and flag genes that exceed the lower
threshold'ile
Sample entropy is a metric that can be used to identify outlier samples by
locating rows which are overly influenced by a single count value. This
metric is calculated for each gene/feature *g* as follows::
p_i = c_i/sumj(c_j)
sum(p_i) = 1
H_g = -sum_i(p_i*log2(p_i))
Here, c_i is the number of counts in sample i, p_i is the fraction of reads
contributed by sample i to the overall counts of the row, and H_g is the
Shannon entropy of the row when using log2. The maximum value possible for
H is 2 when using Shannon entropy. Genes/features with very low entropy are
those where a small number of samples makes up most of the counts across
all samples.
Parameters
----------
counts_obj: de_toolkit.CountMatrix
count matrix object
threshold: float
the lower percentile below which to flag genes
Returns
-------
pandas.DataFrame
data frame with one row for each row in the input counts matrix and two
columns:
- *entropy*: the calculated entropy value for that row
- *entropy_p0_XX*: a True/False column for genes flagged as having an
entropy value less than the 0.XX percentile; *XX* is the
first two digits of the selected threshold
'''
obj = EntropyCounts(counts_obj, threshold)
return obj.output
class EntropyCounts(DetkModule):
def __init__(self, counts_obj, threshold):
self['params'] = {'threshold': threshold}
self.counts_obj = counts_obj
counts_transpose = counts_obj.counts.copy().transpose()
trshld_name = str(threshold).split('.')[1]
# check that no features have a total of zero
all_features = counts_transpose.columns.tolist()
counts_transpose = counts_transpose.loc[:, (counts_transpose != 0).any(axis=0)]
nonzero_features = counts_transpose.columns.tolist()
dropped_features = set(all_features) - set(nonzero_features)
# create a null results df for all of the dropped features
dropped_df = pd.DataFrame(columns=['entropy', 'entropy_p0_{}'.format(trshld_name)], index=dropped_features)
dropped_df.replace(dropped_df, 'Null')
# calculate the entropy over all of the features
entropy = counts_transpose.apply(func=sc.entropy, axis=0)
# gathers the features and entropy values for the respective quantile groups
entropy_threshold = np.percentile(entropy, q=threshold)
# create the results of the entropy test
# column 1 is the entropy value
# column 2 is a boolean indication whether the value is under the user described threshold
results_df = pd.DataFrame(entropy, columns=['entropy'])
results_df['entropy_p0_{}'.format(trshld_name)] = entropy < entropy_threshold
frames = [results_df, dropped_df]
results_df = pd.concat(frames)
# set the results index to be in the same order as the counts index
results_df.index = counts_obj.counts.index
self.results_df = results_df
@property
def output(self):
return self.results_df
@property
def properties(self):
return {'num_kept': len(self.results_df)
}
def plot_entropy(entropy_res, threshold, name=None, show=None):
'''
Function accepts a counts file, a cutoff threshold. The counts file should have the samples
as the columns and the features as the rows. The cutoff threshold should be a float value
between 0 and 1. If a name (in the the form of *.png) is given, the figure will be saved with
the specified name. If show is set to 'show', the plot will be shown.
'''
entropy = entropy_res['entropy']
# sort the entropy values in ascending order
entropy = entropy.sort_values(ascending=True)
entropy_threshold = np.percentile(entropy, q=threshold)
# plot histogram
fig = plt.gcf()
plt.hist(entropy, bins='auto', log=True)
plt.axvline(entropy_threshold, color='red')
plt.xlabel('Entropy')
plt.ylabel('Samples Per Bin')
plt.title('Binned Feature Entropy')
plt.legend(['P < {}'.format(threshold), 'Data'])
fig.set_size_inches(10,10)
if name == None and show != None:
plot.show()
elif name != None:
fig.savefig(name, dpi=100)
elif name != None and show != None:
fig.savefig(name, dpi=100)
plot.show()
def main(argv=sys.argv):
if '--version' in argv :
from .version import __version__
print(__version__)
return
# add the common opts to the docopt strings
cmd_opts_aug = {}
for k,v in cmd_opts.items() :
cmd_opts_aug[k] = _cli_doc(v)
if len(argv) < 2 or (len(argv) > 1 and argv[1] not in cmd_opts) :
docopt(_cli_doc(__doc__),argv)
argv = argv[1:]
cmd = argv[0]
if cmd == 'entropy' :
args = docopt(cmd_opts_aug['vst'],argv)
count_obj = CountMatrixFile(args['<counts_fn>'])
data = CountMatrixFile(args['<counts_fn>'])
pval = float(args['--percentile'])
# run the entropy_calc function
out = EntropyCounts(data.counts, pval)
if args['--plot-output'] :
plot_entropy(out_df, pval, name=plot)
elif cmd == 'trim' :
args = docopt(cmd_opts_aug['trim'],argv)
count_obj = CountMatrixFile(args['<counts_fn>'])
out = trim(count_obj)
elif cmd == 'shrink' :
args = docopt(cmd_opts_aug['shrink'],argv)
count_obj = CountMatrixFile(args['<counts_fn>'])
if args['--p-max'] is None:
args['--p-max'] = np.sqrt(1./count_obj.counts.shape[1])
out = ShrinkCounts(count_obj,
shrink_factor=float(args['--shrink-factor']),
p_max=float(args['--p-max']),
iters=int(args['--iters'])
)
if args['--output'] == 'stdout' :
f = sys.stdout
else :
f = args['--output']
out.output.to_csv(f,sep='\t')
# write out the report json
with DetkReport(args['--report-dir']) as r :
r.add_module(
out,
in_file_path=args['<counts_fn>'],
out_file_path=args['--output'],
column_data_path=args.get('--column-data'),
workdir=os.getcwd()
)
if __name__ == '__main__':
main()