Source code for de_toolkit.transform

r'''
Usage:
    detk-transform plog [options] <count_fn>
    detk-transform vst [options] <count_fn>
    detk-transform rlog [options] <count_fn> [<design> <cov_fn>]
'''
TODO = '''
    detk-transform ruvseq <count_fn>
'''

cmd_opts = {
    'vst':r'''
Usage:
    detk-transform vst [options] <count_fn>

Options:
    -o FILE --output=FILE  Destination of primary output [default: stdout]
    --rda=RDA              Filename passed to saveRDS() R function of the result
                           objects from the analysis
''',
    'plog':r'''
Usage:
    detk-transform plog [options] <count_fn>

Options:
    -c N --pseudocount=N   The pseudocount to use when taking the log transform [default: 1]
    -b B --base=B          The base of the log to use [default: 10]
    -o FILE --output=FILE  Destination of primary output [default: stdout]
''',
    'rlog':r'''
Usage:
    detk-transform rlog [options] <count_fn> [<design> <cov_fn>]

Options:
    -o FILE --output=FILE  Destination of primary output [default: stdout]
    --rda=RDA              Filename passed to saveRDS() R function of the result
                           objects from the analysis
    --strict               Require that the sample order indicated by the column names in the
                           counts file are the same as, and in the same order as, the
                           sample order in the row names of the covariates file
    --blind N              If False, count_obj is expected to have column_data
''',
}

from docopt import docopt
import math, os
import numpy
import pandas
import sys
from .common import CountMatrixFile, DetkModule, _cli_doc
from .wrapr import (
                require_r, require_deseq2, wrapr, RExecutionError, RPackageMissing,
                require_r_package
        )
from .util import stub
from .report import DetkReport

[docs]def plog(count_obj,pseudocount=1,base=10) : ''' Logarithmic transform of a counts matrix with fixed pseudocount, i.e. $\\log(x+c)$ Parameters ---------- count_obj : CountMatrix object count matrix object Returns ------- pandas.DataFrame log transformed counts dataframe with the same dimensionality as input counts ''' obj = PlogCounts(count_obj, pseudocount, base) return obj.output
class PlogCounts(DetkModule): def __init__(self, count_obj, pseudocount=1, base=10): self['params'] = {'pseudocount': pseudocount, 'base': base} self.count_obj = count_obj self.plog_counts = numpy.log(count_obj.counts+pseudocount)/numpy.log(base) @property def output(self): return self.plog_counts @property def properties(self): return {'num_length': len(self.plog_counts), 'params': self['params']} def vst(count_obj) : ''' Variance Stabilizing Transformation implemented in the DESeq2 bioconductor package. Parameters ---------- count_obj : CountMatrix object count matrix object Returns ------- pandas.DataFrame VST transformed counts dataframe with the same dimensionality as input counts ''' obj = VstCounts(count_obj) return obj.output class VstCounts(DetkModule): @require_r('DESeq2','SummarizedExperiment') def __init__(self, count_obj): self.count_obj = count_obj script = '''\ library(DESeq2) library(SummarizedExperiment) cnts <- as.matrix(read.csv(counts.fn,row.names=1)) colData <- data.frame(name=seq(ncol(cnts))) dds <- DESeqDataSetFromMatrix(countData = cnts, colData = colData, design = ~ 1) dds <- varianceStabilizingTransformation(dds) write.csv(assay(dds),out.fn) ''' with wrapr(script, counts=count_obj.counts, raise_on_error=True) as r : vsd_values = r.output self.vsd_values = vsd_values @property def output(self): return self.vsd_values @property def properties(self): return {'num_length': len(self.vsd_values) } def rlog(count_obj, blind=True) : ''' Regularized log (rlog) transformation implemented in the DESeq2 bioconductor package. Parameters ---------- count_obj : CountMatrix object count matrix object blind : bool the `blind` parameter as passed to the `rlog` function in DESeq2. if False, `count_obj` is expected to have `column_data` and a valid design as required by the R function Returns ------- pandas.DataFrame rlog transformed counts dataframe with the same dimensionality as input counts ''' obj = RlogCounts(count_obj, blind) return obj.output class RlogCounts(DetkModule): @require_r('DESeq2','SummarizedExperiment') def __init__(self, count_obj, blind=True): self['params'] = {'blind': blind } self.count_obj = count_obj script = '''\ library(DESeq2) library(SummarizedExperiment) cnts <- as.matrix(read.csv(counts.fn,row.names=1)) rnames <- rownames(cnts) # DESeq2 whines when input counts aren't integers # round the counts matrix cnts <- data.frame(apply(cnts,2,function(x) { round(as.numeric(x)) })) rownames(cnts) <- rnames # load design matrix if(file.info(metadata.fn)$size != 0) { colData <- read.csv(metadata.fn,header=T,as.is=T,row.names=1) } else { # just to convince DESeq2 that everything is ok when we're doing a # blind rlog n.fake.class.1 <- floor(ncol(cnts)/2) fake.classes <- factor(c( rep(0,n.fake.class.1), rep(1,ncol(cnts)-n.fake.class.1)) ) colData <- data.frame(name=fake.classes) } blind <- params$blind form <- params$design dds <- DESeqDataSetFromMatrix(countData = cnts, colData = colData, design = formula(form) ) dds <- rlog(dds,blind=blind) write.csv(assay(dds),out.fn) ''' column_data = None if not blind and count_obj.column_data is not None : column_data = count_obj.design_matrix.full_matrix params = { 'design': '~ 1' if blind else count_obj.design, 'blind': blind } with wrapr(script, counts=count_obj.counts, metadata=column_data, params=params, raise_on_error=True) as r : vsd_values = r.output self.vsd_values = vsd_values @property def output(self): return self.vsd_values @property def properties(self): return {'num_length': len(self.vsd_values) } @stub def ruvseq(count_obj) : pass def main(argv=sys.argv) : if '--version' in argv : from .version import __version__ print(__version__) return # add the common opts to the docopt strings cmd_opts_aug = {} for k,v in cmd_opts.items() : cmd_opts_aug[k] = _cli_doc(v) if len(argv) < 2 or (len(argv) > 1 and argv[1] not in cmd_opts) : docopt(_cli_doc(__doc__)) argv = argv[1:] cmd = argv[0] if cmd == 'vst' : args = docopt(cmd_opts_aug['vst'],argv) count_obj = CountMatrixFile(args['<count_fn>']) out = VstCounts(count_obj) if cmd == 'plog' : args = docopt(cmd_opts_aug['plog'],argv) count_obj = CountMatrixFile(args['<count_fn>']) print(args) out = PlogCounts( count_obj, pseudocount=float(args['--pseudocount']), base=float(args['--base']) ) elif cmd == 'rlog' : args = docopt(cmd_opts_aug['rlog'],argv) count_obj = CountMatrixFile( args['<count_fn>'], args['<cov_fn>'], design=args['<design>'], strict=args.get('--strict',False) ) if args['--blind'] is None: args['--blind'] = True out = RlogCounts(count_obj, blind=args['--blind']) if args['--output'] == 'stdout' : f = sys.stdout else : f = args['--output'] out.output.to_csv(f,sep='\t') with DetkReport(args['--report-dir']) as r : r.add_module( out, in_file_path=args['<count_fn>'], out_file_path=args['--output'], column_data_path=args.get('--column-data'), workdir=os.getcwd() ) if __name__ == '__main__' : main()