diff --git a/system/dataset.py b/system/dataset.py index 22f9bd9..bb14564 100644 --- a/system/dataset.py +++ b/system/dataset.py @@ -160,6 +160,10 @@ class Dataset: """Returns dataset name""" return self._name + def get_matrix(self): + """Returns internal numeric matrix for dataset.""" + return self._array + def get_all_dims(self): """Returns all dimensions in project""" return self._all_dims diff --git a/workflows/go_workflow.py b/workflows/go_workflow.py index 6d6b1d4..59c0dc6 100644 --- a/workflows/go_workflow.py +++ b/workflows/go_workflow.py @@ -36,6 +36,10 @@ class EinarsWorkflow (Workflow): regression.add_function(Function('pls', 'PLS')) self.add_stage(regression) + explore = Stage('explore', 'Explorative analysis') + explore.add_function(PCAFunction(self)) + self.add_stage(explore) + save = Stage('save', 'Save Data') save.add_function(DatasetSaveFunction()) self.add_stage(save) @@ -191,6 +195,7 @@ class CelFileImportFunction(Function): silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r) silent_eval('E <- ReadAffy(filenames=c("%s"))' % '", "'.join(chooser.get_filenames())) + silent_eval('E <- rma(E)') m = rpy.r('m <- E@exprs') @@ -202,8 +207,40 @@ class CelFileImportFunction(Function): rpy.r.rm(["E", "m"]) if m: - return [dataset.Dataset(m, (('ids', rownames), ('filename', colnames)), "AffyMatrix Data")] + return [dataset.Dataset(m, (('ids', rownames), ('filename', colnames)), name="AffyMatrix Data")] else: logger.log("notice", "No data loaded from importer.") finally: chooser.destroy() + + +class PCAFunction(Function): + """Generic PCA function.""" + def __init__(self, workflow): + Function.__init__(self, 'pca', 'PCA') + self._workflow = workflow + + def run(self, data): + import rpy + + dim_2, dim_1 = data.get_dim_names() + + + silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r) + rpy.with_mode(rpy.NO_CONVERSION, rpy.r.assign)("m", data.get_matrix()) + silent_eval("t = prcomp(t(m))") + + T_ids = map(str, range(1, rpy.r("dim(t$x)")[1]+1)) + T = dataset.Dataset(rpy.r("t$x"), [(dim_1, data.get_identifiers(dim_1)), + ("component", T_ids)], name="T") + P = dataset.Dataset(rpy.r("t$rotation"), [(dim_2, data.get_identifiers(dim_2)), + ("component", T_ids)], name="P") + + # cleanup + rpy.r.rm(["t", "m"]) + + loading_plot1 = plots.ScatterPlot(self._workflow.project,P,'ids','component','1','2') + loading_plot2 = plots.ScatterPlot(self._workflow.project,P,'ids','component','3','4') + score_plot = plots.ScatterPlot(self._workflow.project,T,'filename','component','1','2') + + return [T, P, loading_plot1, loading_plot2, score_plot]