diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 3990924..bbf1c86 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -25,7 +25,7 @@ jobs: # against a few system libraries the base image doesn't ship. - if: runner.os == 'Linux' run: sudo apt-get update && sudo apt-get install -y libgl1 libxkbcommon-x11-0 libxcb-cursor0 - - run: pip install "numpy<2" pandas scipy tqdm pytest PyQt5 + - run: pip install "numpy<2" pandas scipy scikit-learn tqdm pytest PyQt5 - run: python -m pytest code/tests -v lint: diff --git a/code/clusterpurity.py b/code/clusterpurity.py new file mode 100644 index 0000000..a6971df --- /dev/null +++ b/code/clusterpurity.py @@ -0,0 +1,108 @@ +""" +MPACT +Copyright 2022, Robert M. Samples, Sara P. Puckett, and Marcy J. Balunas + +Qt-free dendrogram "purity" coloring: a branch is colored green if every +leaf beneath it shares the same group label -- i.e. that group is a +monophyletic clade, it clustered together before merging with anything +else -- and left at the default color otherwise. Used by the dendrogram tab +to make it visually obvious whether technical replicates of one Sample +cluster tightly together, and separately whether biological replicates of +one Biolgroup are well separated from other groups. + +Default colors are green/magenta rather than the more conventional +green/red -- red-green colorblindness (the most common form) makes the two +indistinguishable; magenta stays distinguishable from green under all +common forms of color vision deficiency. + +This module is Qt-free and unit-tested (see ``tests/test_clusterpurity.py``). +""" + + +def purity_link_color_func(Z, leaf_labels, true_color='green', false_color='magenta', neutral_color='black'): + """Build a ``link_color_func`` for ``scipy.cluster.hierarchy.dendrogram``. + + Three-way coloring, classified by comparing the two children's label + sets (not by simply asking "is the merge result impure", which would + paint every ancestor of a single mixing event false_color all the way to + the root): + + - ``true_color`` ("monophyletic"): the two children's label sets are + identical and contain exactly one label -- every leaf under this link + shares one label. + - ``false_color`` ("polyphyletic"): the two children's label sets + *overlap* (share at least one label) without being identical-and- + singleton -- this is definitive proof that some label's leaves are + split apart by this exact merge (some of that label is on each side), + i.e. genuinely non-monophyletic, not just "still impure from before". + - ``neutral_color``: the two children's label sets are *disjoint* (no + label in common) -- this merge simply joins two regions that don't + contradict each other; it's a clean bridge even if one or both + children are themselves impure from a *different* label's tangle + further down. This is what keeps a single low-level tangle from + cascading false_color all the way up the tree: once a tangled label's + clade stops growing (nothing more of that label to fold in), every + merge above it only ever joins disjoint regions, so it reverts to + ``neutral_color``. + + Args: + Z: linkage matrix (``scipy.cluster.hierarchy.linkage`` or + fastcluster's drop-in) built on observations in the same order + as ``leaf_labels``. + leaf_labels: sequence, length == number of observations clustered by + ``Z``, giving each leaf's group label (e.g. its Sample or + Biolgroup), in the same order as the data passed to ``linkage``. + + Returns: + callable: ``link_color_func(k)`` as expected by ``dendrogram``'s + ``link_color_func`` argument. + """ + n_leaves = len(leaf_labels) + leaf_label_sets = {i: {leaf_labels[i]} for i in range(n_leaves)} + colors = {} + for i, row in enumerate(Z): + a, b = int(row[0]), int(row[1]) + node_id = n_leaves + i + set_a, set_b = leaf_label_sets[a], leaf_label_sets[b] + merged = set_a | set_b + leaf_label_sets[node_id] = merged + if len(merged) == 1: + colors[node_id] = true_color + elif set_a.isdisjoint(set_b): + colors[node_id] = neutral_color + else: + colors[node_id] = false_color + return lambda k: colors.get(k, neutral_color) + + +def purity_summary(Z, leaf_labels): + """Count how many distinct group labels form one pure clade each. + + A label is "pure" only if *every* leaf carrying that label ends up + together in one clade before that clade merges with any other leaf -- + i.e. the group is exactly monophyletic in the dendrogram. (A node whose + descendants are a uniform-but-incomplete subset of a label -- e.g. 2 of + a Sample's 3 technical replicates -- does NOT count: the third + replicate clustering elsewhere means that Sample isn't really pure.) + + Returns: + (n_pure, n_total): number of distinct labels that are fully pure + clades, out of the total number of distinct labels in + ``leaf_labels``. + """ + n_leaves = len(leaf_labels) + leaf_index_sets = {i: frozenset((i,)) for i in range(n_leaves)} + target_sets = { + label: frozenset(i for i in range(n_leaves) if leaf_labels[i] == label) + for label in set(leaf_labels) + } + pure_labels = set() + for i, row in enumerate(Z): + a, b = int(row[0]), int(row[1]) + node_id = n_leaves + i + merged = leaf_index_sets[a] | leaf_index_sets[b] + leaf_index_sets[node_id] = merged + for label, target in target_sets.items(): + if merged == target: + pure_labels.add(label) + return len(pure_labels), len(target_sets) diff --git a/code/groupsets.py b/code/groupsets.py index 6bee426..39789d8 100644 --- a/code/groupsets.py +++ b/code/groupsets.py @@ -122,6 +122,34 @@ def remove(self, index=None): del self._items[index] self.select(self._selected) + def move(self, from_index, to_index): + """Reorder the groupset at ``from_index`` to ``to_index``. + + Both indices are clamped to the valid range; out-of-range or equal + indices are a no-op. Selection follows the moved item, so the + groupset that was selected before the move is still selected after + (by identity, not by index) -- a drag-and-drop reorder shouldn't + change which groupset is being edited. + """ + if not self._items: + return + from_index = max(0, min(from_index, len(self._items) - 1)) + to_index = max(0, min(to_index, len(self._items) - 1)) + if from_index == to_index: + return + selected_item = self.selected + groupset = self._items.pop(from_index) + self._items.insert(to_index, groupset) + if selected_item is not None: + # Identity, not '==' -- GroupSet.__eq__ is value-based, and two + # distinct groupsets can compare equal (e.g. freshly added ones + # before either is edited), so list.index() could pick the wrong + # one. + for i, item in enumerate(self._items): + if item is selected_item: + self._selected = i + break + def update(self, index, *, name=None, src=None, incl=None, excl=None, colour=None): """Overwrite the given fields of the groupset at ``index``.""" groupset = self._items[index] diff --git a/code/main.py b/code/main.py index c9d3528..418e2bd 100644 --- a/code/main.py +++ b/code/main.py @@ -13,11 +13,10 @@ import time import string -import platform from PyQt5 import QtCore, QtWidgets -from PyQt5.QtWidgets import QMainWindow, QSizeGrip, QGraphicsDropShadowEffect, QFileDialog, QListWidgetItem, QColorDialog -from PyQt5.QtCore import (QCoreApplication, QPropertyAnimation, QDate, QDateTime, QMetaObject, QObject, QPoint, QRect, QSize, QTime, QUrl, Qt, QEvent) -from PyQt5.QtGui import QBrush, QColor, QIcon, QPalette, QPainter, QPixmap +from PyQt5.QtWidgets import QMainWindow, QSizeGrip +from PyQt5.QtCore import QObject, Qt +from PyQt5.QtGui import QPixmap from pathlib import Path # Install/verify non-stock dependencies (epam.indigo, UpSetPlot, squarify) @@ -35,14 +34,14 @@ import files from MSFaST import run_MSFaST, analysis_parameters -from groupsets import GroupSet, GroupSetModel, build_query_dict +from groupsets import GroupSetModel, build_query_dict from plotslots import PlotSlotRegistry from paramfields import save_checkbox_fields from csvcache import cached_read_csv, invalidate as invalidate_csv_cache from biogroups import compute_biological_groups from dbsearch import search_npatlas from searchtree import SearchTreePanel -from plotting import plot_abund, show_spectrum, show_featureplt, plot_heatmap, plot_mzrt, plot_samplecorr, kendrick, plot_volcano, plot_fc3d, plot_dendrogram, plot_PCA, prev_cv, gen_upsetplt, gen_treemap +from plotting import plot_abund, show_spectrum, show_featureplt, plot_heatmap, plot_mzrt, plot_samplecorr, kendrick, plot_volcano, plot_fc3d, plot_dendrogram, plot_ordination, prev_cv, plot_upset, plot_treemap import getfragdb from indigo import Indigo @@ -71,27 +70,31 @@ -add bypass for plots based on checkmark. possibly use if check: ... else: button.hide() then pass - distribution of CVs on bottom of cvplt? -- add pca option and allow visualization of key features on multivar plt? #TODO# -- in source spectra viewer in tab plot -- do overall data quality score, AUC +- in source spectra viewer in spectrum details tab plot with preexisting in source fragment deconvolution algoirthm +- clean up import sections and general code for better maintability and good syntax/standards + ~main.py's own import section done (dead PyQt5/stdlib/groupsets imports removed, + verified unused via pyflakes + grep, no behavior change); other files not yet swept +- do overall data quality score, AUC on CV plot or something, may be present in a different form already - standardize method and class names -- database management, options -- fix up analysisinfo file output - -- mzmine msp file import -- add other ordination options +- add terminal output with current line to status bar instead of just static status messages, perhaps with expand button to show full terminal output +- potentially consider other database options like HMDB etc +- fix up analysisinfo file output with better and more useful log ingo +- add other ordination options like pca, pls-da, etc etc - add custom keyword arguments for each plot to make calling them easier -- add runcheck before searching when switching to search tab -- Figure out way to have only active plot be updated and then to update others - when plot is switched -- make it so groups can be reordered +- make it so groups can be reordered in the groupsets widgets? + ~model-layer support done: GroupSetModel.move() (groupsets.py), tested in + test_groupsets.py. UI drag-drop wiring (listWidget_pltgrps InternalMove + + syncing its rowsMoved signal to model.move()) not done -- needs a live + GUI session to verify the selection-tracking interacts correctly with + updatesets()'s existing blockSignals dance, which isn't something to + guess at unverified - consider if indexing and feature highly functions in plot options have any easy wins for optimization or disk use. (prob not) - make goto buttons just one class and lambda an index for the stacked widgets when connecting! - +likely items that need more thought and planning - maybe have a comparison mode for many different strains with and without elicitor - specificity/sensitivity plot - other statistical models @@ -238,7 +241,15 @@ def __init__(self): self.ui.setupUi(self) self.ui.label_credits.setText('v1.00.01 r26.06.29') - + + # "PCA" was a misnomer left over from when this checkbox/button only + # ran NMDS (see plotting.plot_ordination) -- the underlying + # checkBox_pca objectName/analysis_params.PCA attribute stay + # unchanged for .mpct save-file compatibility; only the visible text + # and tooltip change. + self.ui.checkBox_pca.setText('Multivariate') + self.ui.btn_pca.setToolTip('Multivariate Ordination (PCA/NMDS/PLS-DA)') + #initialize other dialog windows self.dialog = dialog() self.ftrdialog = ftrdialog() @@ -759,6 +770,13 @@ def _refresh_highlight(self): ) self.canvas['kmd'].draw_idle() + # Update the multivariate plot's loadings-view highlight (a separate + # concept from its scores view, which highlights a clicked *sample* + # via parent.pickedsample, not a feature -- so this only applies + # when self.pca exists and is currently showing loadings). + if getattr(self, 'pca', None) is not None: + self.pca.highlight_loading(self.pickedfeature, self.highlightcol) + # Update feature plot with the selected feature self.highlight['featureplt'].set_data( [iondict.loc[self.pickedfeature, 'Retention time (min)']], @@ -1038,6 +1056,11 @@ def _generate_plots(self): dfs = self.filtereddfs grpsts = self.groupsets + self._create_or_reset('treemap', 'treemap', + lambda: plot_treemap(self, 'treemap', self.ui.frame_treemap, pltfile, '', ''), + lambda: self.treemap.reset(pltfile, '', '')) + stop_functime('treemap complete') + if params.CVfil: self._create_or_reset('prevcv', 'CV plot', lambda: prev_cv(self, 'cvplt', self.ui.frame_cvplt, 'none', 'none', 'none'), @@ -1055,10 +1078,10 @@ def _generate_plots(self): stop_functime('dendrogram complete') if params.PCA: - self._create_or_reset('pca', 'PCA/NMDS plot', - lambda: plot_PCA(self, 'pca', self.ui.frame_pca, pltfile, '', ''), + self._create_or_reset('pca', 'multivariate ordination plot', + lambda: plot_ordination(self, 'pca', self.ui.frame_pca, pltfile, '', ''), lambda: self.pca.reset(pltfile, '', '')) - stop_functime('nmds complete') + stop_functime('ordination complete') if params.FC3Dplt: self._create_or_reset('fc3d', '3D fold-change plot', @@ -1101,6 +1124,11 @@ def _generate_plots(self): lambda: self.samplecorr.reset(iondictfile, dfs, grpsts)) stop_functime('samplecorr complete') + self._create_or_reset('upset', 'upset plot', + lambda: plot_upset(self, 'upset', self.ui.frame_upset, iondictfile, '', ''), + lambda: self.upset.reset(iondictfile, '', '')) + stop_functime('upsetplt complete') + def run_analysis(self): # Ignore re-clicks while an analysis is already running on the worker thread. if getattr(self, '_analysis_thread', None) is not None and self._analysis_thread.isRunning(): @@ -1153,12 +1181,6 @@ def _on_compute_finished(self): self.ui.btn_run.setEnabled(True) def _finish_analysis(self): - try: - gen_treemap(self) # move back to end - except Exception: - print("not generating tremap due to an error") - stop_functime('treemap complete') - # Used for point opacity based on abundance colouring iondict = cached_read_csv(self.analysis_paramsgui.outputdir / 'iondict.csv', sep=',', header=[0], index_col=None) self.analysis_paramsgui.maxval = iondict['logmax'].max() @@ -1206,11 +1228,6 @@ def _finish_analysis(self): self.fillfttree() self.dbsearchdone = True - try: - gen_upsetplt(self) - except Exception: - print("not generating upset plot due to an error") - stop_functime('upsetplt complete') self.ui.label_status.setText('Analysis Complete') stop_functime('analysis complete') print('') diff --git a/code/ordination.py b/code/ordination.py new file mode 100644 index 0000000..7829e9c --- /dev/null +++ b/code/ordination.py @@ -0,0 +1,343 @@ +""" +MPACT +Copyright 2022, Robert M. Samples, Sara P. Puckett, and Marcy J. Balunas + +Qt-free multivariate ordination backend: PCA, NMDS, and PLS-DA on the +samples x features intensity matrix, plus the data prep (technical-replicate +collapsing) and loadings-selection logic the "multivariate" plot tab needs. + +OPLS-DA is intentionally not implemented here: scikit-learn has no native +support, and the only alternatives (the unmaintained ``pyopls`` package, or a +from-scratch orthogonal-signal-correction implementation) are both +meaningfully riskier than PCA/NMDS/PLS-DA without a reference dataset to +validate against. Logged as future work, not started. + +This module is Qt-free and unit-tested (see ``tests/test_ordination.py``). +""" + +import numpy as np +import pandas as pd +from sklearn.cross_decomposition import PLSRegression +from sklearn.decomposition import PCA +from sklearn.metrics import pairwise_distances +from sklearn import manifold + + +def load_ordination_matrix(file, raw_msdata_header, collapse_replicates): + """Load the samples x features intensity matrix used for ordination. + + This is a near-verbatim port of the data-loading half of the original + (dead-checkbox) ``plot_PCA.plot()`` -- deliberately not redesigned, since + the original's row-grouping math is correct (verified empirically in + ``test_ordination.py``/the scratch script, not re-derived by inspection + here -- this header-juggling is genuinely easy to get subtly wrong by + reasoning about it instead of testing it). Only the hardcoded + ``collapsereps = False`` is now a real parameter. + + Args: + file: path to the canonical ``_filtered.csv`` peak table (3-row + header: Biolgroup, Sample, Injection; see devnotes.md). + raw_msdata_header: the same peak table's 3 header rows, read + *raw* (``header=None, index_col=[0,1,2]).iloc[:3,:].transpose()``, + exactly as the original code reads it -- NOT yet renamed or + re-indexed; that happens inside this function (for the + ``collapse_replicates=True`` case, a *different* header -- + read from the freshly-collapsed intermediate file -- is used + instead, matching the original's control flow exactly). + collapse_replicates: if True, average technical replicates (multiple + Injections under the same Sample) together, keeping biological + replicates (distinct Samples) separate. If False, every + Injection is its own row, as-is. + + Returns: + (X, biolgroup): ``X`` is a DataFrame indexed by sample identifier + ('File'; an Injection name, or a Sample name when collapsed), + columns = features. ``biolgroup`` is a Series, same index as ``X``, + mapping each sample to its biological group. + """ + if collapse_replicates: + # Average technical replicates (Injection) while keeping biological + # replicates (Sample) distinct -- level order is Compound, m/z, RT, + # Biolgroup, Sample, Injection (MSFaST.py's msdata_header.columns + # assignment). Round-trips through a CSV (matching the original) + # so the relabeled 3-row header can be read back the same way the + # uncollapsed path reads the real file, rather than hand-deriving + # unstack()'s resulting column-level order. + msdata = pd.read_csv(file, sep=',', header=[0, 1, 2], index_col=[0, 1, 2]) + try: + msdata = msdata.stack([0, 1, 2], future_stack=True) + except TypeError: + msdata = msdata.stack([0, 1, 2]) + msdata = msdata.groupby(level=[0, 1, 2, 3, 4]).mean().unstack(level=[-1, -2]) + collapsed_columns = msdata.columns.to_list() + msdata = msdata.reset_index() + header = [('', '', 'Compound'), ('', '', 'm/z'), ('', '', 'Retention time (min)')] + for elem in collapsed_columns: + header.append((elem[1], '', elem[0])) + msdata.columns = pd.MultiIndex.from_tuples(header) + msdata.to_csv('averagepca.csv', header=True, index=False) + + msdata_header = pd.read_csv('averagepca.csv', sep=',', header=None, + index_col=[0, 1, 2]).iloc[:3, :].transpose() + pcadf = (pd.read_csv('averagepca.csv', sep=',', header=[2], index_col=[0]) + .drop(['m/z', 'Retention time (min)'], axis=1) + .transpose().astype(float).reset_index().rename(columns={'index': 'File'})) + else: + msdata_header = raw_msdata_header + pcadf = (pd.read_csv(file, sep=',', header=[2], index_col=[0]) + .drop(['m/z', 'Retention time (min)'], axis=1) + .transpose().astype(float).reset_index().rename(columns={'index': 'File'})) + + msdata_header.columns = ['Biolgroup', 'Sample', 'Injection'] + msdata_header = msdata_header.set_index('Injection') + + x = pcadf.set_index('File') + biolgroup = pd.Series( + {file_id: msdata_header.loc[file_id, 'Biolgroup'] for file_id in pcadf['File']}, + name='Biolgroup', + ) + biolgroup.index.name = 'File' + return x, biolgroup + + +def replicate_label_components(raw_msdata_header): + """Derive (Biolgroup, BioRep#, TechRep#) for every Injection, for + building short ``biolgroupname_b_s``-style display + labels as an alternative to raw (sometimes long/uninformative) file + names -- used by the dendrogram tab's "Use Sample/Group Names" toggle. + + BioRep# is the 1-based rank of an Injection's Sample among all distinct + Samples sharing the same Biolgroup (first-seen order in the header); + TechRep# is the 1-based rank of the Injection among all Injections + sharing the same Sample (first-seen order). Both are always assigned + starting at 1, so a Biolgroup with only one Sample still gets "_b1", and + a Sample with only one Injection still gets "_s1" -- there's no minimum- + replicate-count special case to handle. + + Args: + raw_msdata_header: the peak table's 3 header rows, read raw + (``header=None, index_col=[0,1,2]).iloc[:3,:].transpose()``) -- + same format ``load_ordination_matrix`` takes. + + Returns: + DataFrame indexed by Injection, columns ``['Biolgroup', 'Sample', + 'BioRep', 'TechRep']`` (``BioRep``/``TechRep`` are 1-based ints). + """ + header = raw_msdata_header.copy() + header.columns = ['Biolgroup', 'Sample', 'Injection'] + + samples_seen_per_biolgroup = {} + biorep_of_sample = {} + for _, row in header.drop_duplicates('Sample').iterrows(): + biolgroup, sample = row['Biolgroup'], row['Sample'] + samples_seen_per_biolgroup.setdefault(biolgroup, []) + samples_seen_per_biolgroup[biolgroup].append(sample) + biorep_of_sample[sample] = len(samples_seen_per_biolgroup[biolgroup]) + + injections_seen_per_sample = {} + techrep_of_injection = {} + for _, row in header.iterrows(): + sample, injection = row['Sample'], row['Injection'] + injections_seen_per_sample.setdefault(sample, []) + injections_seen_per_sample[sample].append(injection) + techrep_of_injection[injection] = len(injections_seen_per_sample[sample]) + + header['BioRep'] = header['Sample'].map(biorep_of_sample) + header['TechRep'] = header['Injection'].map(techrep_of_injection) + return header.set_index('Injection')[['Biolgroup', 'Sample', 'BioRep', 'TechRep']] + + +def autoscale(x): + """Mean-center and scale each feature to unit variance ("UV-scaling" / + "autoscaling" in chemometrics terminology -- the standard pre-treatment + for PCA/PLS-DA on mass-spec intensity data). + + Without this, raw intensities (which can span several orders of + magnitude between features -- confirmed on real example data: feature + standard deviations ranged from ~1.8 to ~10,000, a ~5800x spread) let a + handful of high-abundance features dominate both the apparent + explained-variance and the loadings, regardless of which features + actually separate the biological groups. NMDS is deliberately NOT put + through this -- its Bray-Curtis dissimilarity is conventionally computed + on raw (or relative) abundances, not standardized ones. + """ + std = x.std(axis=0) + std = std.replace(0, 1) # a zero-variance feature would divide by zero; leave it at 0 instead + return (x - x.mean(axis=0)) / std + + +def run_pca(x, n_components): + """PCA on the samples x features matrix, after autoscaling (see + ``autoscale()``) so the result isn't dominated by whichever features + happen to have the largest raw intensity. + + Returns: + (scores, loadings, explained_variance_ratio): ``scores`` is a + DataFrame (index=samples, columns=PC1..PCn); ``loadings`` is a + DataFrame (index=features, columns=PC1..PCn) of each feature's + contribution to each component; ``explained_variance_ratio`` is an + ndarray of length ``n_components``. + """ + x_scaled = autoscale(x) + pca = PCA(n_components=n_components) + scores = pca.fit_transform(x_scaled.values) + columns = [f'PC{i + 1}' for i in range(n_components)] + scores = pd.DataFrame(scores, index=x.index, columns=columns) + loadings = pd.DataFrame(pca.components_.T, index=x.columns, columns=columns) + return scores, loadings, pca.explained_variance_ratio_ + + +def run_nmds(x, n_components): + """Non-metric MDS on Bray-Curtis sample dissimilarities, warm-started + from a metric MDS solution, then rotated onto principal axes purely for + a stable/sensible orientation (this rotation is NOT a second ordination + of the original features -- it doesn't change the NMDS embedding's + inter-point distances, only its axis orientation). + + Returns: + (scores, explained_variance_ratio, stress): ``explained_variance_ratio`` + here is the variance of the *embedded* (already-reduced) NMDS + coordinates explained by each rotated axis -- not, unlike PCA's, a + measure of how much of the original feature-space variance is + captured. Callers should label this distinctly (e.g. "% of + embedding variance") rather than implying it's the same quantity as + PCA's explained variance. + """ + similarities = pairwise_distances(x.values - x.values.mean(), metric='braycurtis') + + mds = manifold.MDS(n_components=n_components, max_iter=3000, eps=1e-9, + random_state=1, dissimilarity="precomputed", n_jobs=1) + pos = mds.fit(similarities).embedding_ + + nmds = manifold.MDS(n_components=n_components, metric=False, max_iter=3000, + eps=1e-12, dissimilarity="precomputed", random_state=1, + n_jobs=1, n_init=1) + npos = nmds.fit_transform(similarities, init=pos) + stress = nmds.stress_ + + pca = PCA(n_components=n_components) + rotated = pca.fit_transform(npos) + columns = [f'NMDS{i + 1}' for i in range(n_components)] + scores = pd.DataFrame(rotated, index=x.index, columns=columns) + return scores, pca.explained_variance_ratio_, stress + + +def nmds_loading_proxy(x, scores): + """Per-feature correlation with each NMDS axis, as a loadings-equivalent. + + NMDS has no linear feature loadings (it's a rank-based embedding, not a + linear projection of the original features) -- this is the standard + ecology "vector fitting" approach (cf. vegan::envfit): correlate each + original feature with each ordination axis and use that as the + loadings-plot substitute. + + Returns: + DataFrame (index=features, columns=same as ``scores``) of Pearson + correlation coefficients. + """ + return pd.DataFrame( + {col: x.corrwith(scores[col]) for col in scores.columns}, + index=x.columns, + ) + + +def similarity_matrix(x, method): + """Pairwise similarity between samples (rows of ``x``, a samples x + features intensity matrix) -- backs the sample-correlation heatmap's + "Method" switcher. + + - ``'Spearman'``: rank correlation of abundance profiles. The + established default for metabolomics QC (robust to the non-normal, + heavy-tailed abundance distributions typical of LC-MS data); values + in [-1, 1]. + - ``'Jaccard'``: 1 - Jaccard distance on which features are detected + (abundance > 0) in each sample, ignoring relative abundance + entirely -- useful when what matters is which compounds were + detected at all rather than how much; values in [0, 1]. + - ``'Bray-Curtis'``: 1 - Bray-Curtis dissimilarity, the standard + abundance-weighted similarity measure in ecology/metabolomics, + computed on raw abundances (same convention as ``run_nmds``'s + dissimilarity, unlike PCA/PLS-DA's autoscaled features); values in + [0, 1]. + + Returns a samples x samples DataFrame. + """ + if method == 'Spearman': + return x.transpose().corr(method='spearman') + x_filled = x.fillna(0) + if method == 'Jaccard': + dist = pairwise_distances((x_filled > 0).values, metric='jaccard') + elif method == 'Bray-Curtis': + dist = pairwise_distances(x_filled.values, metric='braycurtis') + else: + raise ValueError(f'Unknown similarity method: {method!r}') + return pd.DataFrame(1 - dist, index=x.index, columns=x.index) + + +def run_plsda(x, y, n_components): + """PLS-DA: PLS regression of the samples x features matrix against + one-hot-encoded group labels. + + Args: + x: samples x features DataFrame. + y: Series of group labels, indexed the same as ``x``. + n_components: number of PLS components. + + Returns: + (scores, loadings, explained_variance_ratio): shapes match + ``run_pca``'s. scikit-learn doesn't expose an explained-variance + ratio for PLS directly, so it's computed manually here as each + component's X-score variance divided by the total variance of + (autoscaled) ``x`` -- the standard approach for reporting + %-explained on a PLS biplot. + """ + x_scaled = autoscale(x) + y_dummies = pd.get_dummies(y) + # scale=False: we already autoscaled x ourselves (above), consistent + # with run_pca -- letting PLSRegression's default scale=True scale it + # AGAIN (and scale the 0/1 dummy y columns, which doesn't make sense for + # group-membership indicators) would both double-scale x and distort y. + # (scale=False also previously fixed a bug where x_scores_ lived on a + # different scale than the unscaled total-variance denominator below, + # before autoscaling was added -- see devnotes.md.) + pls = PLSRegression(n_components=n_components, scale=False) + pls.fit(x_scaled.values, y_dummies.values) + x_scores = pls.x_scores_ + columns = [f'PLS{i + 1}' for i in range(n_components)] + scores = pd.DataFrame(x_scores, index=x.index, columns=columns) + loadings = pd.DataFrame(pls.x_loadings_, index=x.columns, columns=columns) + + total_variance = np.sum(x_scaled.values ** 2) + component_variance = np.sum(x_scores ** 2, axis=0) + explained_variance_ratio = component_variance / total_variance + return scores, loadings, explained_variance_ratio + + +def top_loadings(loadings, n=25, always_include=()): + """Subset of ``loadings`` to actually draw on a loadings plot. + + High-dimensional data (thousands of features) can't all be plotted + legibly, so this returns only the top ``n`` features by vector magnitude + (Euclidean norm across all of ``loadings``'s columns) -- plus any + feature named in ``always_include``, even if its magnitude wouldn't + otherwise make the cut. That's what lets the app highlight a specific + (possibly tiny) feature on demand without changing the default view. + + Args: + loadings: DataFrame (index=features, columns=components). + n: how many top-magnitude features to include by default. + always_include: iterable of feature names (must be a subset of + ``loadings.index``) to include regardless of magnitude. + + Returns: + DataFrame: subset of ``loadings`` (same columns), index order + preserved from ``loadings``, with at most ``n + len(always_include)`` + rows (fewer if there's overlap or ``loadings`` itself is smaller). + """ + magnitude = np.sqrt((loadings ** 2).sum(axis=1)) + top_n_index = magnitude.nlargest(min(n, len(loadings))).index + forced = [feat for feat in always_include if feat in loadings.index] + keep = top_n_index.union(forced, sort=False) + # Preserve loadings' original row order rather than magnitude-sorted order. + keep = [feat for feat in loadings.index if feat in keep] + return loadings.loc[keep] diff --git a/code/paramfields.py b/code/paramfields.py index fc76a77..eb0100e 100644 --- a/code/paramfields.py +++ b/code/paramfields.py @@ -23,7 +23,6 @@ CHECKBOX_FIELDS = ( ('PCA', ('ui', 'checkBox_pca')), ('Dendrogram', ('ui', 'checkBox_dend')), - ('bootstrap', ('dialog.ui', 'checkBox_bootstrap')), ('MZRTplt', ('ui', 'checkBox_mzrt')), ('KMD', ('ui', 'checkBox_kmd')), ('mdguide', ('dialog.ui', 'checkBox_mdguide')), diff --git a/code/plotting.py b/code/plotting.py index b1b2e3f..137f2a5 100644 --- a/code/plotting.py +++ b/code/plotting.py @@ -9,6 +9,8 @@ import pickle from csvcache import cached_read_csv, invalidate as invalidate_csv_cache +import ordination +import clusterpurity import matplotlib #matplotlib.style.use('ggplot') @@ -29,15 +31,12 @@ import platform from PyQt5 import QtCore, QtGui, QtWidgets from PyQt5.QtCore import (QCoreApplication, QPropertyAnimation, QDate, QDateTime, QMetaObject, QObject, QPoint, QRect, QSize, QTime, QUrl, Qt, QEvent) -from PyQt5.QtGui import (QBrush, QColor, QIcon, QPalette, QPainter, QPixmap) +from PyQt5.QtGui import (QBrush, QColor, QIcon, QPalette, QPainter) from PyQt5.QtWidgets import * from pathlib import Path import scipy.cluster.hierarchy as shc from sklearn.preprocessing import normalize -from sklearn import manifold -from sklearn.metrics import pairwise_distances -from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler from matplotlib.patches import Ellipse from filter import listfilter @@ -580,57 +579,169 @@ def plot(self, parent, file, filtereddfs, groupsets): # abundance tied opacity u class plot_samplecorr(ui_plot): """ - The plot_samplecorr class generates a heatmap plot of the Spearman or Pearson correlation between samples. - - Parameters: - - parent: the parent widget for the plot - currplt: the index of the current plot within the parent widget - frame: the parent frame for the plot - file: a path to the file containing the ion dictionary - filtereddfs: a dictionary containing filtered dataframes for each group in the plot - groupsets: a dictionary containing GroupSet objects for each group in the plot - Methods: - - __init__(self, parent, currplt, frame, file, filtereddfs, groupsets): initializes the plot by calling the plot() method with the given parameters - plot(self, parent, file, filtereddfs, groupsets): generates the plot with the given data. Reads the ion dictionary from a csv file and reads the filtered data from a csv file generated by the program. Calculates the Spearman correlation matrix and generates a heatmap plot using the Seaborn library. Adjusts the layout of the plot and draws it on the parent canvas. + Sample-correlation heatmap, with a Method/View/label switcher bar + inserted into the *shared* Group Analysis nav bar (frame_12 / + horizontalLayout_25, alongside the pre-existing "Sets"/"Sample + Correlations" buttons) rather than this plot's own canvas -- unlike + plot_dendrogram/plot_ordination's per-canvas bars, these controls only + apply to this page, and the nav bar is shared with the UpSet plot page. + UIFunctions.switch_grpanalysis_tab (ui_functions.py) greys the controls + out when the UpSet page is active. + + Views (same collapsing semantics as plot_dendrogram/plot_ordination, + via ordination.load_ordination_matrix): + - "Biological Replicates" (default, matches this plot's previous, + checkbox-less behaviour): technical replicates averaged per Sample, + biological replicates kept separate. + - "Individual Injections": no averaging, every injection is its own + row/column. + - "Biological Groups": both technical and biological replicates + averaged together, one row/column per Biolgroup -- "see only + biological groups" with technical-replicate averaging implied. + + Methods (ordination.similarity_matrix): + - "Spearman" (default): rank correlation of abundance profiles. + - "Jaccard": presence/absence (feature detected or not), ignoring + abundance. + - "Bray-Curtis": abundance-weighted, the standard ecology/metabolomics + similarity measure. + + "Use Sample/Group Names" checkbox: same nomenclature as the + dendrogram's -- ``_b_s`` (Individual + Injections), ``_b`` (Biological Replicates), or the + raw Biolgroup name (Biological Groups -- nothing left to shorten). """ + + VIEWS = ('Biological Replicates', 'Individual Injections', 'Biological Groups') + METHODS = ('Spearman', 'Jaccard', 'Bray-Curtis') + def __init__(self, parent, currplt, frame, file, filtereddfs, groupsets): ui_plot.__init__(self, parent, currplt, frame) self.parent = parent self.currplt = currplt + self.view = 'Biological Replicates' + self.method = 'Spearman' + self.use_sample_names = False + self._build_grpanalysis_controls(parent) self.plot(parent, file, filtereddfs, groupsets) - + + def _build_grpanalysis_controls(self, parent): + bar = QtWidgets.QWidget() + bar.setStyleSheet(_SWITCHER_BAR_STYLE) + bar.setMaximumHeight(_SWITCHER_BAR_HEIGHT) + layout = QtWidgets.QHBoxLayout(bar) + layout.setContentsMargins(4, 2, 4, 2) + + layout.addWidget(QtWidgets.QLabel('Method:')) + method_combo = QtWidgets.QComboBox() + method_combo.addItems(self.METHODS) + method_combo.setCurrentText(self.method) + method_combo.currentTextChanged.connect(self._on_method_changed) + layout.addWidget(method_combo) + + layout.addWidget(QtWidgets.QLabel('View:')) + view_combo = QtWidgets.QComboBox() + view_combo.addItems(self.VIEWS) + view_combo.setCurrentText(self.view) + view_combo.currentTextChanged.connect(self._on_view_changed) + layout.addWidget(view_combo) + + use_names_check = QtWidgets.QCheckBox('Use Sample/Group Names') + use_names_check.setChecked(self.use_sample_names) + use_names_check.toggled.connect(self._on_use_sample_names_toggled) + layout.addWidget(use_names_check) + + self.method_combo = method_combo + self.view_combo = view_combo + self.use_names_check = use_names_check + self.controls_bar = bar + + # Pushes the new controls to the right of the pre-existing + # Sets/Sample Correlations buttons within the same shared bar, + # rather than editing the generated horizontalLayout_25 itself. + parent.ui.horizontalLayout_25.addStretch(1) + parent.ui.horizontalLayout_25.addWidget(bar) + + def set_controls_enabled(self, enabled): + """Grey the Method/View/Use-Names controls out when the UpSet Plot + page (the nav bar's other tab) is active -- they don't apply there.""" + self.controls_bar.setEnabled(enabled) + + def _on_method_changed(self, method): + self.method = method + self.reset(self._last_file, self._last_filtereddfs, self._last_groupsets) + + def _on_view_changed(self, view): + self.view = view + self.reset(self._last_file, self._last_filtereddfs, self._last_groupsets) + + def _on_use_sample_names_toggled(self, checked): + self.use_sample_names = checked + self.reset(self._last_file, self._last_filtereddfs, self._last_groupsets) + + def _load_matrix(self, parent): + pltfile = parent.analysis_paramsgui.outputdir / (parent.analysis_paramsgui.filename.stem + '_filtered.csv') + raw_header = cached_read_csv(pltfile, sep=',', header=None, index_col=[0, 1, 2]).iloc[:3, :].transpose() + x, biolgroup = ordination.load_ordination_matrix( + pltfile, raw_header.copy(), collapse_replicates=(self.view != 'Individual Injections')) + if self.view == 'Biological Groups': + x = x.groupby(biolgroup).mean() + return x, raw_header + + def _display_labels(self, raw_header, leaf_names): + """Build short ``Biolgroup_b#[_s#]`` labels, mirroring + plot_dendrogram's ``_display_labels`` -- nomenclature switches on + the active View the same way.""" + if self.view == 'Biological Groups': + return leaf_names # already bare Biolgroup names + components = ordination.replicate_label_components(raw_header) + if self.view == 'Biological Replicates': + per_sample = components.drop_duplicates('Sample').set_index('Sample') + return [f"{per_sample.loc[sample, 'Biolgroup']}_b{per_sample.loc[sample, 'BioRep']}" for sample in leaf_names] + return [ + f"{components.loc[injection, 'Biolgroup']}_b{components.loc[injection, 'BioRep']}_s{components.loc[injection, 'TechRep']}" + for injection in leaf_names + ] + def plot(self, parent, file, filtereddfs, groupsets): - iondict = cached_read_csv(self.parent.analysis_paramsgui.outputdir / 'iondict.csv', sep=',', header=[0], index_col=None) - msdata = cached_read_csv(self.parent.analysis_paramsgui.outputdir / (self.parent.analysis_paramsgui.filename.stem + '_filtered.csv'), sep=',', header=[0, 1, 2], index_col=[0, 1, 2]) - try: - msdata = msdata.stack([0, 1, 2], future_stack=True).groupby(level=[0, 1, 2, 3, 4]).mean().droplevel(level=3, axis=0).unstack() - except TypeError: - msdata = msdata.stack([0, 1, 2]).groupby(level=[0, 1, 2, 3, 4]).mean().droplevel(level=3, axis=0).unstack() - msdata.index = msdata.index.droplevel([1, 2]) - pmatrix = msdata.corr(method='spearman') + self._last_file = file + self._last_filtereddfs = filtereddfs + self._last_groupsets = groupsets + + x, raw_header = self._load_matrix(parent) + pmatrix = ordination.similarity_matrix(x, self.method) + + leaf_names = pmatrix.columns.tolist() + display_labels = self._display_labels(raw_header, leaf_names) if self.use_sample_names else leaf_names + fig = self.parent.fig[self.currplt] - ax = self.parent.ax[self.currplt] - # Remove any axes left over from a previous run (notably the colorbar - # that sns.heatmap appends). Without this a new colour-legend bar is - # stacked onto the figure every time the plot is regenerated. - for extra_ax in list(fig.axes): - if extra_ax is not ax: - extra_ax.remove() - ax.clear() + # clf() + add_subplot: sns.heatmap permanently shrinks the axes each + # call to make room for its colorbar; removing the extra colorbar axes + # afterwards doesn't restore the original size. Starting fresh each + # call keeps the axes at a consistent width. + fig.clf() + ax = fig.add_subplot(111) + ax.set_facecolor(self.plotbackground) + ax.set_axisbelow(True) + self.parent.ax[self.currplt] = ax + # Spearman is mathematically capable of going negative, but real + # sample-vs-sample correlations in practice cluster tightly positive + # (e.g. 0.7-1.0) -- a -1..1 scale would compress all of that + # meaningful variation into a sliver of the colour range. 0..1 for + # every method keeps the full range informative. sns.heatmap(pmatrix, ax=ax, cmap=self.parent.analysis_paramsgui.colorscheme, vmin=0, vmax=1) ax.tick_params(axis='both', which='both', labelsize=10) ax.set_xticks(range(len(pmatrix.columns))) - ax.set_xticklabels(pmatrix.columns, rotation=90) + ax.set_xticklabels(display_labels, rotation=90) ax.set_yticks(range(len(pmatrix.index))) - ax.set_yticklabels(pmatrix.index, rotation=0) + ax.set_yticklabels(display_labels, rotation=0) ax.axes.get_xaxis().get_label().set_visible(False) ax.axes.get_yaxis().get_label().set_visible(False) + ax.set_title(self.method, fontsize=10) self.parent.fig[self.currplt].subplots_adjust(left=.1, right=.95, bottom=0.15, top=0.9, hspace=0.2, wspace=0.2) self.parent.canvas[self.currplt].draw() - - + + class kendrick(ui_plot): """ The purpose of this class is to plot the mass defect versus the nominal mass of compounds based on the input files and parameters provided. @@ -800,185 +911,485 @@ def plot(self, parent, file, filtereddfs, groupsets): class plot_dendrogram(ui_plot): """ - Dendrogram generation. - - A CSV file of data for clustering is read and code performs hierarchical clustering using the ward method - and the euclidean distance metric. The resulting dendrogram is plotted on the given frame using the parent object's - figure and canvas. The dendrogram can be either regular or bootstrapped depending on the value of the - parent.analysis_paramsgui.bootstrap parameter. The resulting plot is saved to the parent object's figure and - displayed on the canvas. + Dendrogram generation, with combo-box switchers (same pattern as + plot_ordination's method/view bar) for which leaves to cluster and how + to color the branches: + + Views: + - "Technical Replicates": every injection is its own leaf -- branches + are judged for purity against Sample membership, a quick visual QC + for whether technical replicates are tight. + - "Biological Replicates": injections are first averaged per Sample + (same collapsing logic as the ordination tab's "Collapse Technical + Replicates" checkbox, via ordination.load_ordination_matrix), then + leaves are Samples, judged for purity against Biolgroup -- a quick + visual QC for whether biological groups are separable at all, + independent of technical noise. + + Coloring: + - "Purity": green wherever a branch's leaves are entirely one group + (correctly clustered), magenta wherever a branch mixes more than one + group (polyphyletic). Green/magenta rather than the more conventional + green/red since red-green colorblindness can't distinguish the latter. + - "None": plain black dendrogram, no purity coloring or title -- the + tab's original (pre-purity-coloring) appearance. + + Either view/coloring combination can be regular or bootstrapped + (PvClust) depending on the "Bootstrap" checkbox in this tab's own + switcher bar (formerly the plot-config dialog's global "Bootstrap + Analysis" checkbox -- moved here since it only ever applied to this + plot). The purity-coloring math lives in the Qt-free clusterpurity.py. + + A "Use Sample/Group Names" checkbox swaps the leaf labels from the raw + file/injection names (which can be long or uninformative) to + ``_b_s`` (Technical Replicates view) or + ``_b`` (Biological Replicates view, no TechRep# + since replicates are already collapsed) -- see + ``ordination.replicate_label_components()``. """ + VIEWS = ('Technical Replicates', 'Biological Replicates') + COLOR_MODES = ('Purity', 'None') + def __init__(self, parent, currplt, frame, file, filtereddfs, groupsets): ui_plot.__init__(self, parent, currplt, frame) self.parent = parent self.currplt = currplt + # Defaults match the plot's previous (injection-level, uncolored) + # behaviour exactly, so existing sessions see no change until they + # explicitly switch the new controls. ``bootstrap`` defaults to True + # to match the checked-on-startup state the old global checkbox was + # forced to in ui_functions.py (its Designer default was actually + # False, overridden at runtime -- True is what users actually saw). + self.view = 'Technical Replicates' + self.color_mode = 'Purity' + self.bootstrap = True + self.use_sample_names = False + self._build_switcher_bar(parent, currplt) self.plot(parent, file, filtereddfs, groupsets) + def _build_switcher_bar(self, parent, currplt): + bar = QtWidgets.QWidget() + bar.setStyleSheet(_SWITCHER_BAR_STYLE) + bar.setMaximumHeight(_SWITCHER_BAR_HEIGHT) + layout = QtWidgets.QHBoxLayout(bar) + layout.setContentsMargins(4, 2, 4, 2) + + layout.addWidget(QtWidgets.QLabel('View:')) + view_combo = QtWidgets.QComboBox() + view_combo.addItems(self.VIEWS) + view_combo.setCurrentText(self.view) + view_combo.currentTextChanged.connect(self._on_view_changed) + layout.addWidget(view_combo) + + layout.addWidget(QtWidgets.QLabel('Color:')) + color_combo = QtWidgets.QComboBox() + color_combo.addItems(self.COLOR_MODES) + color_combo.setCurrentText(self.color_mode) + color_combo.currentTextChanged.connect(self._on_color_mode_changed) + layout.addWidget(color_combo) + + bootstrap_check = QtWidgets.QCheckBox('Bootstrap') + bootstrap_check.setChecked(self.bootstrap) + bootstrap_check.toggled.connect(self._on_bootstrap_toggled) + layout.addWidget(bootstrap_check) + + use_names_check = QtWidgets.QCheckBox('Use Sample/Group Names') + use_names_check.setChecked(self.use_sample_names) + use_names_check.toggled.connect(self._on_use_sample_names_toggled) + layout.addWidget(use_names_check) + layout.addStretch() + + self.view_combo = view_combo + self.color_combo = color_combo + self.bootstrap_check = bootstrap_check + self.use_names_check = use_names_check + parent.pltlayout[currplt].insertWidget(0, bar) + + def _on_view_changed(self, view): + self.view = view + self.reset(self._last_file, self._last_filtereddfs, self._last_groupsets) + + def _on_color_mode_changed(self, color_mode): + self.color_mode = color_mode + self.reset(self._last_file, self._last_filtereddfs, self._last_groupsets) + + def _on_bootstrap_toggled(self, checked): + self.bootstrap = checked + self.reset(self._last_file, self._last_filtereddfs, self._last_groupsets) + + def _on_use_sample_names_toggled(self, checked): + self.use_sample_names = checked + self.reset(self._last_file, self._last_filtereddfs, self._last_groupsets) + + def _display_labels(self, raw_header, textlabels): + """Build short ``Biolgroup_b#[_s#]`` leaf labels in place of the raw + file/injection names, when "Use Sample/Group Names" is checked.""" + components = ordination.replicate_label_components(raw_header) + if self.view == 'Biological Replicates': + per_sample = components.drop_duplicates('Sample').set_index('Sample') + return [f"{per_sample.loc[sample, 'Biolgroup']}_b{per_sample.loc[sample, 'BioRep']}" for sample in textlabels] + return [ + f"{components.loc[injection, 'Biolgroup']}_b{components.loc[injection, 'BioRep']}_s{components.loc[injection, 'TechRep']}" + for injection in textlabels + ] + def plot(self, parent, file, filtereddfs, groupsets): - heirarch = pd.read_csv(file, sep=',', header=[2], index_col=[0]).drop(['m/z', 'Retention time (min)'], axis=1) - data_scaled = normalize(heirarch, axis=0) # normalize features - data_scaled = pd.DataFrame(data_scaled, columns=heirarch.columns, index=heirarch.index) - textlabels = [elem for elem in data_scaled.columns.tolist()] - - if parent.analysis_paramsgui.bootstrap: + self._last_file = file + self._last_filtereddfs = filtereddfs + self._last_groupsets = groupsets + + # PvClust (bootstrap path) expects "variables x objects" -- it + # bootstraps over the rows (features) and transposes internally + # before clustering the columns (the objects/leaves). shc.linkage + # (regular path) expects the opposite, "objects x variables" -- + # build both orientations from the same scaled data below. + if self.view == 'Biological Replicates': + # Collapse technical replicates first -- leaves are Samples, + # purity is judged against Biolgroup. + raw_header = cached_read_csv( + parent.analysis_paramsgui.outputdir / (parent.analysis_paramsgui.filename.stem + '_filtered.csv'), + sep=',', header=None, index_col=[0, 1, 2]).iloc[:3, :].transpose() + x, biolgroup = ordination.load_ordination_matrix(file, raw_header.copy(), collapse_replicates=True) + data_scaled = normalize(x.values, axis=1) # normalize each sample's profile + data_scaled = pd.DataFrame(data_scaled, columns=x.columns, index=x.index) # samples x features + textlabels = data_scaled.index.tolist() + leaf_labels = [biolgroup[sample] for sample in textlabels] + data_for_linkage = data_scaled + data_for_pvclust = data_scaled.transpose() + purity_noun = 'biological groups separable' + else: + heirarch = pd.read_csv(file, sep=',', header=[2], index_col=[0]).drop(['m/z', 'Retention time (min)'], axis=1) + data_scaled = normalize(heirarch, axis=0) # normalize features + data_scaled = pd.DataFrame(data_scaled, columns=heirarch.columns, index=heirarch.index) # features x injections + textlabels = data_scaled.columns.tolist() + raw_header = cached_read_csv( + parent.analysis_paramsgui.outputdir / (parent.analysis_paramsgui.filename.stem + '_filtered.csv'), + sep=',', header=None, index_col=[0, 1, 2]).iloc[:3, :].transpose() + raw_header.columns = ['Biolgroup', 'Sample', 'Injection'] + sample_of_injection = raw_header.set_index('Injection')['Sample'].to_dict() + leaf_labels = [sample_of_injection[name] for name in textlabels] + data_for_linkage = data_scaled.transpose() + data_for_pvclust = data_scaled + purity_noun = "samples' replicates clustered together" + + if self.bootstrap: # bootstrap dendrogram - pv = PvClust(data_scaled, method="ward", metric="euclidean", nboot=1000, parallel=True) - dend = pv.plot(parent.ax[self.currplt], labels=textlabels) + pv = PvClust(data_for_pvclust, method="ward", metric="euclidean", nboot=1000, parallel=True) + Z = pv.linkage_matrix else: # regular dendrogram - dend = shc.dendrogram(shc.linkage(data_scaled.transpose(), method='ward'), ax=parent.ax[self.currplt], leaf_rotation=90, color_threshold=0, above_threshold_color='black', labels=textlabels) # default leaf label size 16 + Z = shc.linkage(data_for_linkage, method='ward') + + if self.color_mode == 'Purity': + # Green = monophyletic (correctly clustered); magenta = + # polyphyletic (mixes more than one group). Magenta rather than + # the conventional red -- distinguishable from green under + # red-green colorblindness, the most common form. + link_color_func = clusterpurity.purity_link_color_func(Z, leaf_labels, true_color='green', false_color='magenta') + else: + link_color_func = None # plain black dendrogram, scipy's own default + + display_labels = self._display_labels(raw_header, textlabels) if self.use_sample_names else textlabels + + if self.bootstrap: + dend = pv.plot(parent.ax[self.currplt], labels=display_labels, link_color_func=link_color_func) + else: + dend = shc.dendrogram(Z, ax=parent.ax[self.currplt], leaf_rotation=90, color_threshold=0, above_threshold_color='black', link_color_func=link_color_func, labels=display_labels) # default leaf label size 16 + + if self.color_mode == 'Purity': + n_pure, n_total = clusterpurity.purity_summary(Z, leaf_labels) + parent.ax[self.currplt].set_title(f'{n_pure}/{n_total} {purity_noun}', fontsize=10) + # "None" coloring intentionally leaves no title -- this tab had none + # before purity coloring was added. parent.fig[self.currplt].subplots_adjust( left=0.1, right=0.95, bottom=0.35, top=0.9, hspace=0.2, wspace=0.2) parent.canvas[self.currplt].draw() -class plot_PCA(ui_plot): - #plots NMDS data - # should include opion to allow user specified pca colors - # need to fix selection of samples on PCA plot - # should add PCA vs NMDS option +# Shared by plot_dendrogram's and plot_ordination's combo-box switcher bars +# -- page_dend and page_pca both have the same light background +# (rgba(225,225,225,255), see ui_main.py), unlike searchtree.py's filter bar +# (a dark-themed tab) -- dark text on a light/white combo box, not +# searchtree's light-on-dark scheme. +_SWITCHER_BAR_HEIGHT = 32 + +_SWITCHER_BAR_STYLE = """ +QWidget { + background: transparent; +} +QComboBox { + background-color: rgb(255,255,255); + color: rgb(30,30,30); + border: 1px solid rgb(150,150,150); + border-radius: 2px; + padding: 2px; +} +QComboBox:disabled { + background-color: rgb(220,220,220); + color: rgb(150,150,150); + border: 1px solid rgb(195,195,195); +} +QLabel { + color: rgb(30,30,30); + background: transparent; +} +QLabel:disabled { + color: rgb(150,150,150); +} +QCheckBox:disabled { + color: rgb(150,150,150); +} +""" + + +class plot_ordination(ui_plot): + """Multivariate ordination plot: PCA, NMDS, or PLS-DA, with a + scores-vs-loadings view toggle. + + A combo-box switcher bar (built once in ``__init__``, inserted above the + canvas the same way ``SearchTreePanel``'s filter bar is substituted into + a Designer placeholder -- see searchtree.py) lets the user pick the + ordination method and the scores/loadings view; both redraw onto the + same axes via ``self.plot(...)`` rather than rebuilding the canvas. + + The actual math lives in the Qt-free ``ordination.py`` (PCA/NMDS/PLS-DA, + technical-replicate collapsing, top-N loadings selection); this class is + just the Qt plumbing and rendering on top of it. + + The switcher bar also has a "Collapse Replicates" checkbox (formerly the + plot-config dialog's global "Collapse Technical Replicates" checkbox -- + moved here since it only ever applied to this plot). + """ + + METHODS = ('NMDS', 'PCA', 'PLS-DA') + VIEWS = ('Scores', 'Loadings') + def __init__(self, parent, currplt, frame, file, filtereddfs, groupsets): ui_plot.__init__(self, parent, currplt, frame) self.parent = parent self.currplt = currplt + # Defaults match the plot's previous (NMDS-only, scores-only) + # behaviour exactly, so existing sessions see no change until they + # explicitly switch the new controls. ``collapse_replicates`` + # defaults to True, matching the old global checkbox's Designer + # default. + self.method = 'NMDS' + self.view = 'Scores' + self.collapse_replicates = True + self.loadings_df = None + self._build_switcher_bar(parent, currplt) self.plot(parent, file, filtereddfs, groupsets) + def _build_switcher_bar(self, parent, currplt): + bar = QtWidgets.QWidget() + bar.setStyleSheet(_SWITCHER_BAR_STYLE) + bar.setMaximumHeight(_SWITCHER_BAR_HEIGHT) + layout = QtWidgets.QHBoxLayout(bar) + layout.setContentsMargins(4, 2, 4, 2) + + layout.addWidget(QtWidgets.QLabel('Method:')) + method_combo = QtWidgets.QComboBox() + method_combo.addItems(self.METHODS) + method_combo.setCurrentText(self.method) + method_combo.currentTextChanged.connect(self._on_method_changed) + layout.addWidget(method_combo) + + layout.addWidget(QtWidgets.QLabel('View:')) + view_combo = QtWidgets.QComboBox() + view_combo.addItems(self.VIEWS) + view_combo.setCurrentText(self.view) + view_combo.currentTextChanged.connect(self._on_view_changed) + layout.addWidget(view_combo) + + collapse_check = QtWidgets.QCheckBox('Collapse Replicates') + collapse_check.setChecked(self.collapse_replicates) + collapse_check.toggled.connect(self._on_collapse_replicates_toggled) + layout.addWidget(collapse_check) + layout.addStretch() + + self.method_combo = method_combo + self.view_combo = view_combo + self.collapse_check = collapse_check + parent.pltlayout[currplt].insertWidget(0, bar) + + def _on_method_changed(self, method): + self.method = method + self.reset(self._last_file, self._last_filtereddfs, self._last_groupsets) + + def _on_view_changed(self, view): + self.view = view + self.reset(self._last_file, self._last_filtereddfs, self._last_groupsets) + + def _on_collapse_replicates_toggled(self, checked): + self.collapse_replicates = checked + self.reset(self._last_file, self._last_filtereddfs, self._last_groupsets) + def plot(self, parent, file, filtereddfs, groupsets): - """Plot principal component analysis (PCA) or NMDS data. - + """(Re)draw the ordination plot for the current method/view. + Args: - parent (QWidget): Parent widget. - currplt (int): Current plot number. - frame (QFrame): Frame to hold the plot. - file (str): Path to the file containing the PCA data. - filtereddfs (list): List of filtered data. - groupsets (list): List of group sets. - - Attributes: - highlightcol (tuple): Tuple containing RGBA values used for highlighting. - event (int): Identifier for the pick event used to select points on the plot. - - Methods: - plot(self, parent, file, filtereddfs, groupsets): Plot the PCA data. - plot_point_cov(self, points, nstd=2, ax=None, **kwargs): Generate an ellipse for the confidence interval. - lighten_color(self, color, amount=0.5): Lighten a given color by a given amount. - plot_cov_ellipse(self, cov, pos, nstd=2, ax=None, **kwargs): Generate an optimized ellipse for the confidence interval. + parent (QWidget): Parent widget (MainWindow). + file (str): Path to the ``_filtered.csv`` peak table. + filtereddfs, groupsets: unused here (kept for the shared + ``_create_or_reset``/``reset`` call signature every plot + class follows). """ parent = self.parent - parent.collapsereps = False#parent.dialog.ui.checkBox_collapsereps.isChecked() - - if parent.collapsereps: - # Average techreps if replicate collapse is selected - msdata = pd.read_csv(file, sep=',', header=[0, 1, 2], index_col=[0, 1, 2]) - try: - msdata = msdata.stack([0, 1, 2], future_stack=True) - except TypeError: - msdata = msdata.stack([0, 1, 2]) - msdata = msdata.groupby(level=[0, 1, 2, 3, 4]).mean().unstack(level=[-1, -2]) - test2 = msdata.columns.to_list() - msdata = msdata.reset_index() - header = [('','','Compound'), ('','','m/z'), ('','','Retention time (min)')] - for elem in test2: - header.append((elem[1], '', elem[0])) - msdata.columns = pd.MultiIndex.from_tuples(header) - msdata.to_csv('averagepca.csv', header=True, index=False) - - msdata_header = pd.read_csv('averagepca.csv', sep=',', header=None, index_col=[0, 1, 2]).iloc[:3, :].transpose() - pcadf = pd.read_csv('averagepca.csv', sep=',', header=[2], index_col=[0]).drop(['m/z', 'Retention time (min)'], axis=1).transpose().astype(float).reset_index().rename(columns={'index': 'File'}) - else: - msdata_header = cached_read_csv(parent.analysis_paramsgui.outputdir / (parent.analysis_paramsgui.filename.stem + '_filtered.csv'), sep=',', header=None, index_col=[0, 1, 2]).iloc[:3, :].transpose() - pcadf = pd.read_csv(file, sep=',', header=[2], index_col=[0]).drop(['m/z', 'Retention time (min)'], axis=1).transpose().astype(float).reset_index().rename(columns={'index': 'File'}) + self._last_file = file + self._last_filtereddfs = filtereddfs + self._last_groupsets = groupsets + + collapse_replicates = self.collapse_replicates + raw_header = cached_read_csv( + parent.analysis_paramsgui.outputdir / (parent.analysis_paramsgui.filename.stem + '_filtered.csv'), + sep=',', header=None, index_col=[0, 1, 2]).iloc[:3, :].transpose() + x, biolgroup = ordination.load_ordination_matrix(file, raw_header.copy(), collapse_replicates) + + n_components = max(2, min(len(x) - 1, 10)) - components = len(msdata_header.index) - if components > 10: - components = 10 - msdata_header.columns = ['Biolgroup', 'Sample', 'Injection'] - msdata_header = msdata_header.set_index('Injection') colors = ['red', 'blue', 'black', 'grey', 'purple', 'orange', 'green', 'yellow', 'lime', 'plum', 'teal', 'olivedrab', 'sienna', 'maroon', 'navy', 'lightcoral', 'darkgoldenrod', 'seagreen', 'lightseagreen', 'aqua', 'lightsteelblue', 'slateblue', 'blueviolet', 'plum', 'burlywood', 'salmon', 'aquamarine', 'magenta', 'tan'] colorpos, biolgroupmap = 0, {} - for elem in msdata_header['Biolgroup']: + for elem in biolgroup: if elem not in biolgroupmap and elem != parent.analysis_paramsgui.blnkgrp: ###### delete blank clause OR CHANGE TO THE BLNKFILTER OPTION biolgroupmap[elem] = colors[colorpos] colorpos += 1 - - features = pcadf.columns.values[1:] - x = pcadf[features].values - y = pcadf[['File']].values - x -= x.mean() - similarities = pairwise_distances(x, metric='braycurtis') - - mds = manifold.MDS(n_components=components, max_iter=3000, eps=1e-9, random_state=1, - dissimilarity="precomputed", n_jobs=1) - pos = mds.fit(similarities).embedding_ - - nmds = manifold.MDS(n_components=components, metric=False, max_iter=3000, eps=1e-12, - dissimilarity="precomputed", random_state=1, n_jobs=1, - n_init=1) - npos = nmds.fit_transform(similarities, init=pos) - stress_value = nmds.stress_ - print("NMDS stress: " +str(stress_value)) - - pca = PCA(n_components=components) - nmdspc = pca.fit_transform(npos) - expvar = pca.explained_variance_ratio_ - pcadftest = pd.DataFrame(data=nmdspc) - - ncomplist = list(range(components)) - nmdspc = pd.DataFrame(data=nmdspc, columns=ncomplist) - nmdspc['File'] = pcadf['File'] - nmdspc['Biolgroup'] = '' - for i, elem in enumerate(nmdspc.iloc[:, components]): - nmdspc.iloc[i, components + 1] = msdata_header.loc[elem, 'Biolgroup'] - principalDf = nmdspc.set_index('File') - + + plot_title = None + if self.method == 'PCA': + scores, loadings, expvar = ordination.run_pca(x, n_components) + axis_labels = [f'PC{i + 1} ({100 * expvar[i]:.1f}%)' for i in range(2)] + elif self.method == 'PLS-DA': + scores, loadings, expvar = ordination.run_plsda(x, biolgroup, n_components) + axis_labels = [f'PLS{i + 1} ({100 * expvar[i]:.1f}%)' for i in range(2)] + else: + scores, expvar, stress = ordination.run_nmds(x, n_components) + loadings = ordination.nmds_loading_proxy(x, scores) + # NMDS doesn't canonically report percent-variance-explained the + # way PCA/PLS-DA do (it's a rank-based embedding, not a linear + # decomposition of the feature space) -- stress is the + # conventional thing to report for NMDS instead. + axis_labels = ['NMDS1', 'NMDS2'] + plot_title = f'Stress: {stress:.4f}' + + self.loadings_df = loadings + principalDf = scores.copy() + principalDf['Biolgroup'] = biolgroup + + if self.view == 'Loadings': + self._plot_loadings(parent, loadings, axis_labels) + else: + self._plot_scores(parent, principalDf, biolgroupmap, axis_labels) + + if plot_title: + parent.ax[self.currplt].set_title(plot_title, fontsize=10) + + parent.fig[self.currplt].subplots_adjust(left=.1, right=.9, bottom=0.1, top=0.9, hspace=0.2, wspace=0.2) + parent.canvas[self.currplt].draw() + + def _plot_scores(self, parent, principalDf, biolgroupmap, axis_labels): for elem in biolgroupmap: scatterframe = principalDf[principalDf['Biolgroup'] == elem] - points = scatterframe.iloc[:,[0,1]].to_numpy() + points = scatterframe.iloc[:, [0, 1]].to_numpy() if np.shape(points)[0] > 2: self.plot_point_cov(points, nstd=2, ax=parent.ax[self.currplt], alpha=0.5, color=self.lighten_color(biolgroupmap[elem], 0.3)) - parent.ax[self.currplt].scatter(scatterframe.iloc[:,0], scatterframe.iloc[:,1], color=biolgroupmap[elem], marker='o', s=30, label=str(elem), picker=True) - + parent.ax[self.currplt].scatter(scatterframe.iloc[:, 0], scatterframe.iloc[:, 1], color=biolgroupmap[elem], marker='o', s=30, label=str(elem), picker=True) + parent.highlight[self.currplt], = parent.ax[self.currplt].plot([], [], 'o', markersize=12, color='yellow') - parent.ax[self.currplt].set_xlabel('NMDS1', **self.fcsfont) # (' + str(round(100*expvar[0], 2)) + '%)' - parent.ax[self.currplt].set_ylabel('NMDS2', **self.fcsfont) #(' + str(round(100*expvar[1], 2)) + '%)' - - #following two lines put a hard limit on the axis tick distance - #parent.ax[self.currplt].xaxis.set_major_locator(ticker.MultipleLocator(0.1)) - #parent.ax[self.currplt].yaxis.set_major_locator(ticker.MultipleLocator(0.1)) - + parent.ax[self.currplt].set_xlabel(axis_labels[0], **self.fcsfont) + parent.ax[self.currplt].set_ylabel(axis_labels[1], **self.fcsfont) + self.highlightcol = (0, 0, 0, 0) parent.pickedsample = pd.DataFrame(0, index=['empty'], columns=['empty']) - - def picksample(event): # fix this + + def picksample(event): if _is_duplicate_pick(parent, event): return ind = event.ind - coord = event.artist.get_offsets()[ind,:] - newsample = principalDf.loc[principalDf.iloc[:,0] == coord[0,0], :].loc[principalDf.iloc[:,1] == coord[0,1], :].reset_index() + coord = event.artist.get_offsets()[ind, :] + newsample = principalDf.loc[principalDf.iloc[:, 0] == coord[0, 0], :].loc[principalDf.iloc[:, 1] == coord[0, 1], :].reset_index() if newsample.empty: return - if newsample.iloc[0,0] == parent.pickedsample.iloc[0,0] and self.highlightcol != (0, 0, 0, 0): + if newsample.iloc[0, 0] == parent.pickedsample.iloc[0, 0] and self.highlightcol != (0, 0, 0, 0): self.highlightcol = (0, 0, 0, 0) else: self.highlightcol = 'yellow' - + parent.pickedsample = newsample - parent.ui.lbl_injname.setText('Injection/Sample: ' + str(parent.pickedsample.iloc[0,0])) - parent.highlight[self.currplt].set_data(coord[0,0],coord[0,1]) + parent.ui.lbl_injname.setText('Injection/Sample: ' + str(parent.pickedsample.iloc[0, 0])) + parent.highlight[self.currplt].set_data(coord[0, 0], coord[0, 1]) parent.highlight[self.currplt].set_color(self.highlightcol) parent.canvas[self.currplt].draw_idle() - + self.event = parent.canvas[self.currplt].figure.canvas.mpl_connect('pick_event', picksample) - parent.fig[self.currplt].subplots_adjust(left=.1, right=.9, bottom=0.1, top=0.9, hspace=0.2, wspace=0.2) - #x0,x1 = parent.ax[self.currplt].get_xlim() - #0,y1 = parent.ax[self.currplt].get_ylim() - #parent.ax[self.currplt].set_aspect(abs(x1-x0)/abs(y1-y0)) - #parent.ax[self.currplt].set_aspect('equal') parent.ax[self.currplt].legend() - parent.canvas[self.currplt].draw() - + + def _plot_loadings(self, parent, loadings, axis_labels): + """Loadings (biplot-style) view: origin-anchored arrows for the + top-N features by vector magnitude, plus -- regardless of + magnitude -- whichever feature is currently highlighted elsewhere + in the app (``parent.pickedfeature``), so a feature too small to + make the default cut is still visible on demand. + """ + always_include = [parent.pickedfeature] if getattr(parent, 'pickedfeature', '') else [] + # Rank by magnitude within the 2 displayed components only, not the + # full (up to 10-component) loadings -- a feature could rank in the + # overall top-25 purely from a large contribution to some other, + # unplotted component while barely showing up here, displacing a + # feature that's actually prominent in this 2D view. + subset = ordination.top_loadings(loadings.iloc[:, :2], n=25, always_include=always_include) + + # ax.annotate()'s arrows don't reliably drive matplotlib's autoscale + # the way ax.scatter()/ax.plot() do (confirmed empirically: points + # can end up outside the auto-picked view limits), so the axis + # range is set explicitly here instead of relying on autoscale. + # Symmetric around 0 since loadings/correlations are naturally + # origin-centered (a biplot convention). + limit = subset.iloc[:, :2].abs().values.max() * 1.2 if len(subset) else 1.0 + parent.ax[self.currplt].set_xlim(-limit, limit) + parent.ax[self.currplt].set_ylim(-limit, limit) + + for feature, row in subset.iterrows(): + xcoord, ycoord = row.iloc[0], row.iloc[1] + parent.ax[self.currplt].annotate( + '', xy=(xcoord, ycoord), xytext=(0, 0), + arrowprops=dict(arrowstyle='->', color='steelblue', lw=1)) + parent.ax[self.currplt].annotate( + str(feature), xy=(xcoord, ycoord), fontsize=8, color='black') + + # Pre-created empty artist for the highlighted-loading marker, + # following the same convention as the scores view's + # parent.highlight[currplt] -- updated on demand by + # MainWindow._refresh_highlight() via self.highlight_loading(), + # even when the highlighted feature isn't in the default top-25. + self.loadings_highlight, = parent.ax[self.currplt].plot([], [], 'o', markersize=12, color='yellow', zorder=5) + self.highlight_loading(getattr(parent, 'pickedfeature', ''), getattr(parent, 'highlightcol', (0, 0, 0, 0))) + + parent.ax[self.currplt].axhline(0, color='grey', lw=0.5) + parent.ax[self.currplt].axvline(0, color='grey', lw=0.5) + parent.ax[self.currplt].set_xlabel(axis_labels[0], **self.fcsfont) + parent.ax[self.currplt].set_ylabel(axis_labels[1], **self.fcsfont) + + def highlight_loading(self, feature, colour): + """Update the loadings-view highlight marker for ``feature`` (a + no-op outside the loadings view or before it's been drawn once). + + Called from ``MainWindow._refresh_highlight()`` -- the same + pre-create-empty-artist/update-via-set_data convention every other + plot's highlight already follows, just driven by this plot's own + last-computed loadings instead of ``iondict``. + """ + if self.view != 'Loadings' or self.loadings_df is None or not hasattr(self, 'loadings_highlight'): + return + if not feature or feature not in self.loadings_df.index: + self.loadings_highlight.set_data([], []) + else: + row = self.loadings_df.loc[feature] + self.loadings_highlight.set_data([row.iloc[0]], [row.iloc[1]]) + self.loadings_highlight.set_color(colour) + self.parent.canvas[self.currplt].draw_idle() + def plot_point_cov(self, points, nstd=2, ax=None, **kwargs): """Generate an ellipse for the confidence interval. @@ -1140,114 +1551,165 @@ def plot(self, parent, file, filtereddfs, groupsets): parent.canvas[currplt].draw() -def gen_upsetplt(parent): #need to do something to handle groups with names that are substrings of other group names +def _detach_placeholder_widget(frame, old_widget): + """Remove a Designer-placed placeholder widget (and the layout holding + it) from ``frame`` so a fresh layout can be installed in its place. + + Most plot frames in this app start out empty in Designer, so + ``ui_plot.__init__`` can just call ``frame.setLayout(...)`` directly. + ``frame_treemap``/``frame_upset`` are the exception -- Designer gave + them a layout with a placeholder ``QLabel`` (the old static-image + target) already in it, and Qt refuses ``setLayout()`` on a frame that + already has one. Reparenting the old layout onto a throwaway widget + (the standard Qt trick for "delete this layout") detaches it from + ``frame`` without touching anything else -- same runtime + widget-substitution pattern as searchtree.py's filter-bar swap. + """ + old_layout = frame.layout() + if old_layout is not None: + old_layout.removeWidget(old_widget) + old_widget.setParent(None) + old_widget.deleteLater() + QtWidgets.QWidget().setLayout(old_layout) + + +class plot_treemap(ui_plot): + """Treemap of how many features each enabled filter removed. + + Drawn directly onto a persistent FigureCanvas (same runtime-widget- + substitution + ui_plot pattern as every other plot tab) instead of the + previous ``squarify.plot()`` -> ``savefig('treemap.png')`` -> ``QPixmap`` + round trip into a Designer-placed ``QLabel`` (``label_treemap``) -- that + PNG round trip meant no zoom/pan/save-at-resolution toolbar, and a flat + raster file rewritten at the repo root on every run. """ - Generate an upset plot to visualize sets of compounds in groups. This function also handles groups with names that are substrings of other group names. - Parameters: - parent (object): The parent object that the generated plot will be a child of. + def __init__(self, parent, currplt, frame, file, filtereddfs, groupsets): + _detach_placeholder_widget(frame, parent.ui.label_treemap) + ui_plot.__init__(self, parent, currplt, frame) + self.parent = parent + self.currplt = currplt + self.plot(parent, file, filtereddfs, groupsets) - Returns: - None - """ - iondict = cached_read_csv(parent.analysis_paramsgui.outputdir / 'iondict.csv', sep=',', header=0, index_col=None) - - # Apply filters if required - if parent.analysis_paramsgui.relfil: - iondict = iondict[iondict['pass_relfil']] - if parent.analysis_paramsgui.decon: - iondict = iondict[iondict['pass_insource']] - if parent.analysis_paramsgui.blnkfltr: - iondict = iondict[iondict['pass_blnkfil']] - if parent.analysis_paramsgui.CVfil: - iondict = iondict[iondict['pass_cvfil']] - - # Prepare data for upset plot - iongroups = iondict['groups'].tolist() - freq = {} - biolgroups = [] - for item in iongroups: - if item not in freq: - freq[item] = 0 - freq[item] += 1 - - header = cached_read_csv(parent.analysis_paramsgui.outputdir / (parent.analysis_paramsgui.filename.stem + '_filtered.csv'), sep=',', header=None, index_col=[0, 1, 2]).iloc[0, :] - for elem in header: - if elem not in biolgroups: - biolgroups.append(elem) - - sets = [' ' + elem for elem in list(freq.keys())] - size = list(freq.values()) - setdf = pd.DataFrame({'groups': sets}) - for elem in biolgroups: #have to do this if one group is a substring of another, add space - setdf[elem] = setdf['groups'].str.contains(' ' + elem) - setdf['size'] = size - setdf = setdf.iloc[:, 1:] - setdf = setdf.set_index(biolgroups)['size'] - - # Plot and display the upset plot - with plt.rc_context({"font.size": 8}): - upsetplt = upsetplot.plot(setdf, show_counts='%d', show_percentages=True, sort_categories_by=None) - - figup = upsetplt['matrix'].figure - figup.set_size_inches(5, 4) - figup.set_facecolor((0, 0, 0, 0)) - upsetplt['intersections'].set_facecolor((1, 1, 1, .25)) - figup.savefig('test_upsetplt.png', dpi=150, bbox_inches='tight') - pixmap = QPixmap('test_upsetplt.png') - parent.ui.label_upset.setPixmap(pixmap) - -def gen_treemap(parent): - #generate treemap for visualization of filtering levels - #needed to refilter data and see how df row lengths change to avoid issues with one feature being in multiple filter lists - """ - The gen_treemap function generates a treemap for visualizing filtering levels. The function reads a CSV file containing the - filtered data and another CSV file containing information about the ions. The function then filters the ion data based on - various filter options and calculates the number of ions filtered by each filter. Finally, the function generates a treemap - to display the number of ions that passed each filter and saves it as a PNG file. The treemap is then displayed in a QLabel in the GUI. + def plot(self, parent, file, filtereddfs, groupsets): + msdata_filtered = cached_read_csv( + parent.analysis_paramsgui.outputdir / (parent.analysis_paramsgui.filename.stem + '_filtered.csv'), + sep=',', header=[0, 1, 2], index_col=[0, 1, 2]) + iondict = cached_read_csv(parent.analysis_paramsgui.outputdir / 'iondict.csv', sep=',', header=[0], index_col=[0]) - Args: - - parent: the parent widget where the treemap will be displayed + fltrcnt, color = {}, [] + current = len(iondict.index) + + if parent.analysis_paramsgui.relfil: + filteredsetsize = len(iondict[iondict['pass_relfil']].index) + fltrcnt['Mispicked'] = current - filteredsetsize + current = filteredsetsize + color.append('#0000ff') + + if parent.analysis_paramsgui.blnkfltr: + filteredsetsize = len(iondict[iondict['pass_blnkfil']].index) + fltrcnt['Blank'] = current - filteredsetsize + current = filteredsetsize + color.append('#00aaaa') + + if parent.analysis_paramsgui.CVfil: + fltrcnt['Nonreproducible'] = len(parent.ionfilters['cv'].ions) + current = current - fltrcnt['Nonreproducible'] + color.append('#ff0000') + + if parent.analysis_paramsgui.decon: + fltrcnt['Insource'] = len(parent.ionfilters['insource'].ions) + color.append('#00aa00') + + fltrcnt['High Quality'] = len(msdata_filtered.index) + color.append('#000000') + + sizes = list(fltrcnt.values()) + total_size = sum(sizes) + labels = [f"{label}\n{size}\n{round(100 * size / total_size, 1)}%" for label, size in fltrcnt.items()] + ax = parent.ax[self.currplt] + ax.clear() + squarify.plot(sizes=sizes, label=labels, color=color, alpha=0.3, text_kwargs={'fontsize': 10}, ax=ax) + ax.axis('off') + parent.canvas[self.currplt].draw() + + +class plot_upset: + """Upset plot of how filtered features distribute across groupsets. + + Drawn directly onto a persistent Figure -- ``upsetplot.plot()`` accepts + an existing ``fig=`` instead of always creating its own -- rather than + the previous ``upsetplot.plot()`` -> ``savefig('test_upsetplt.png')`` -> + ``QPixmap`` round trip into the Designer-placed ``label_upset``. + + Doesn't subclass ``ui_plot``, the same as ``plot_heatmap`` and for the + same reason: ``upsetplot`` lays out several axes (matrix, totals, + intersections, shading) on the figure itself via its own gridspec -- + there's no single "ax" to hand callers the way every scatter/line plot + here has, so ``ui_plot.__init__``'s single pre-made ``ax`` would just be + an unused, overlapping blank axes. """ - plt.clf() - msdata_filtered = cached_read_csv(parent.analysis_paramsgui.outputdir / (parent.analysis_paramsgui.filename.stem + '_filtered.csv'), sep=',', header=[0, 1, 2], index_col=[0, 1, 2]) - fltrcnt, color = {}, [] - iondict = cached_read_csv(parent.analysis_paramsgui.outputdir / 'iondict.csv', sep=',', header=[0], index_col=[0]) - total = len(iondict.index) - current = total - - if parent.analysis_paramsgui.relfil: - filteredsetsize = len(iondict[iondict['pass_relfil']].index) - fltrcnt['Mispicked'] = current - filteredsetsize - current = filteredsetsize - color.append('#0000ff') - - if parent.analysis_paramsgui.blnkfltr: - filteredsetsize = len(iondict[iondict['pass_blnkfil']].index) - fltrcnt['Blank'] = current - filteredsetsize - current = filteredsetsize - color.append('#00aaaa') - - if parent.analysis_paramsgui.CVfil: - fltrcnt['Nonreproducible'] = len(parent.ionfilters['cv'].ions) - current = current - fltrcnt['Nonreproducible'] - color.append('#ff0000') - - if parent.analysis_paramsgui.decon: - fltrcnt['Insource'] = len(parent.ionfilters['insource'].ions) - color.append('#00aa00') - - fltrcnt['High Quality'] = len(msdata_filtered.index) - color.append('#000000') - - sizes = list(fltrcnt.values()) - total_size = sum(fltrcnt.values()) - labels = [f"{label}\n{size}\n{round(100*size/total_size,1)}%" for label, size in fltrcnt.items()] - - squarify.plot(sizes=sizes, label=labels, color=color, alpha=0.3, text_kwargs={'fontsize': 10}) - plt.axis('off') - plt.savefig('treemap.png', dpi=150, bbox_inches='tight') - pixmap = QPixmap('treemap.png') - parent.ui.label_treemap.setPixmap(pixmap) \ No newline at end of file + + def __init__(self, parent, currplt, frame, file, filtereddfs, groupsets): + _detach_placeholder_widget(frame, parent.ui.label_upset) + self.parent = parent + self.currplt = currplt + + parent.fig[currplt] = Figure() + parent.pltlayout[currplt] = QtWidgets.QVBoxLayout() + parent.canvas[currplt] = FigureCanvas(parent.fig[currplt]) + parent.pltlayout[currplt].addWidget(parent.canvas[currplt]) + parent.toolbar[currplt] = NavigationToolbar(parent.canvas[currplt], parent) + parent.toolbar[currplt].setStyleSheet("background-color:rgba(225,225,225,0);") + parent.pltlayout[currplt].addWidget(parent.toolbar[currplt]) + frame.setLayout(parent.pltlayout[currplt]) + + self.plotbackground = (.89, .89, .89, 0) + self.plot(parent, file, filtereddfs, groupsets) + + def plot(self, parent, file, filtereddfs, groupsets): + # upsetplot.plot() lays out fresh axes via its own gridspec on + # whatever figure it's given -- clear the figure first so repeated + # calls (regenerate/Apply) don't pile up axes on top of each other. + parent.fig[self.currplt].clf() + + iondict = cached_read_csv(parent.analysis_paramsgui.outputdir / 'iondict.csv', sep=',', header=0, index_col=None) + if parent.analysis_paramsgui.relfil: + iondict = iondict[iondict['pass_relfil']] + if parent.analysis_paramsgui.decon: + iondict = iondict[iondict['pass_insource']] + if parent.analysis_paramsgui.blnkfltr: + iondict = iondict[iondict['pass_blnkfil']] + if parent.analysis_paramsgui.CVfil: + iondict = iondict[iondict['pass_cvfil']] + + iongroups = iondict['groups'].tolist() + freq = {} + for item in iongroups: + freq[item] = freq.get(item, 0) + 1 + + header = cached_read_csv( + parent.analysis_paramsgui.outputdir / (parent.analysis_paramsgui.filename.stem + '_filtered.csv'), + sep=',', header=None, index_col=[0, 1, 2]).iloc[0, :] + biolgroups = [] + for elem in header: + if elem not in biolgroups: + biolgroups.append(elem) + + sets = [' ' + elem for elem in freq.keys()] + setdf = pd.DataFrame({'groups': sets}) + for elem in biolgroups: # space-prefix handles one group name being a substring of another + setdf[elem] = setdf['groups'].str.contains(' ' + elem) + setdf['size'] = list(freq.values()) + setdf = setdf.iloc[:, 1:].set_index(biolgroups)['size'] + + with plt.rc_context({"font.size": 8}): + upsetplt = upsetplot.plot(setdf, fig=parent.fig[self.currplt], show_counts='%d', show_percentages=True, sort_categories_by=None) + + parent.fig[self.currplt].set_facecolor(self.plotbackground) + upsetplt['intersections'].set_facecolor((1, 1, 1, .25)) + parent.canvas[self.currplt].draw() + + def reset(self, file, filtereddfs, groupsets): + self.plot(self.parent, file, filtereddfs, groupsets) \ No newline at end of file diff --git a/code/pvclust.py b/code/pvclust.py index 353d301..6943b83 100644 --- a/code/pvclust.py +++ b/code/pvclust.py @@ -187,10 +187,11 @@ def _result(self): columns=['AU', 'BP', 'SE.AU', 'SE.BP', 'pchi', 'v', 'c']) return result - def plot(self, ax, labels=None): #added axis input + def plot(self, ax, labels=None, link_color_func=None): #added axis input """Plot dendrogram with AU BP values for each node""" plot_dendrogram(self.linkage_matrix, - np.array(self.result[['AU', 'BP']]), ax, labels) + np.array(self.result[['AU', 'BP']]), ax, labels, + link_color_func) def seplot(self, pvalue='AU', annotate=False): """p-values vs Standard error plot""" @@ -271,7 +272,7 @@ def find_clusters(self): return clusters -def plot_dendrogram(linkage_matrix, pvalues, axis, labels=None): #added axis input +def plot_dendrogram(linkage_matrix, pvalues, axis, labels=None, link_color_func=None): #added axis input """ Plot dendrogram with AU BP values for each node""" d = dendrogram(linkage_matrix, no_plot=True) xcoord = d["icoord"] @@ -280,34 +281,57 @@ def plot_dendrogram(linkage_matrix, pvalues, axis, labels=None): #added axis inp x = {i: (j[1]+j[2])/2 for i, j in enumerate(xcoord)} y = {i: j[1] for i, j in enumerate(ycoord)} pos = node_positions(y, x) - - plt.figure(figsize=(12, 8)) - plt.tight_layout() set_link_color_palette(['c', 'g']) + # link_color_func, when given, takes priority over color_threshold/ + # above_threshold_color (scipy's own precedence rule) -- that's how the + # dendrogram tab's purity coloring (clusterpurity.py) reaches the + # bootstrap dendrogram too. d = dendrogram(linkage_matrix, labels=labels, above_threshold_color='black', - color_threshold=0, leaf_rotation=90, ax = axis) + color_threshold=0, leaf_rotation=90, ax=axis, + link_color_func=link_color_func) maxval = max(y.values()) ax = axis - for node, (x, y) in pos.items(): #modifications added to scale y axis label shifts + + # AU/BP labels used to be positioned with a fixed x-shift in icoord + # units (e.g. "x-7"). icoord spacing is always 10 units per leaf + # regardless of leaf count, but the AXES' actual pixel width is not -- + # with more leaves squeezed into the same plot width, each icoord unit + # maps to fewer pixels, so a fixed icoord offset shrinks to a fixed + # *fraction* of leaf spacing but an ever-shrinking number of *pixels*, + # eventually overlapping (e.g. "AU"/"BP" merging into "AUBP" once there + # are enough leaves). Anchoring with ha='right'/'left' plus a constant + # offset in POINTS (not data units) keeps a fixed pixel gap regardless + # of leaf count or icoord scale -- no more digit-width-dependent x-shift + # hack for AU values of 100 vs not. Per-node fontsize is similarly + # scaled down as leaf count grows, so neighbouring nodes' labels (which + # do have a fixed minimum icoord-and-therefore-pixel separation) don't + # run into each other either. + n_leaves = len(d['ivl']) + value_fontsize = max(5, min(8, 140 / n_leaves)) + header_fontsize = value_fontsize + 3 + gap_points = 2 + + for node, (nx, ny) in pos.items(): #modifications added to scale y axis label shifts + y_offset = ny + maxval / 200 if node == (len(pos.items())-1): - ax.text(x-6, y+maxval/200, 'AU', fontsize=11, fontweight='bold', - color='purple') - ax.text(x+1, y+maxval/200, 'BP', fontsize=11, fontweight='bold', - color='black') + ax.annotate('AU', xy=(nx, y_offset), xytext=(-gap_points, 0), + textcoords='offset points', ha='right', va='bottom', + fontsize=header_fontsize, fontweight='bold', color='purple') + ax.annotate('BP', xy=(nx, y_offset), xytext=(gap_points, 0), + textcoords='offset points', ha='left', va='bottom', + fontsize=header_fontsize, fontweight='bold', color='black') else: - if pvalues[node][0]*100 == 100: - ax.text(x-10, y+maxval/200, f' {pvalues[node][0]*100:.0f}', fontsize=8, - color='purple', fontweight='bold') - ax.text(x+1, y+maxval/200, f'{pvalues[node][1]*100:.0f}', fontsize=8, - color='black', fontweight='bold') - else: - ax.text(x-7, y+maxval/200, f' {pvalues[node][0]*100:.0f}', fontsize=8, - color='purple') - ax.text(x+1, y+maxval/200, f'{pvalues[node][1]*100:.0f}', fontsize=8, - color='black') -# plt.savefig('dendrogram.pdf') + au_significant = pvalues[node][0] * 100 == 100 + ax.annotate(f'{pvalues[node][0]*100:.0f}', xy=(nx, y_offset), xytext=(-gap_points, 0), + textcoords='offset points', ha='right', va='bottom', + fontsize=value_fontsize, color='purple', + fontweight='bold' if au_significant else 'normal') + ax.annotate(f'{pvalues[node][1]*100:.0f}', xy=(nx, y_offset), xytext=(gap_points, 0), + textcoords='offset points', ha='left', va='bottom', + fontsize=value_fontsize, color='black', + fontweight='bold' if au_significant else 'normal') def node_positions(x, y): diff --git a/code/test_upsetplt.png b/code/test_upsetplt.png deleted file mode 100644 index da8893e..0000000 Binary files a/code/test_upsetplt.png and /dev/null differ diff --git a/code/tests/test_clusterpurity.py b/code/tests/test_clusterpurity.py new file mode 100644 index 0000000..490544e --- /dev/null +++ b/code/tests/test_clusterpurity.py @@ -0,0 +1,119 @@ +"""Unit tests for dendrogram purity coloring (``clusterpurity.py``).""" + +import numpy as np +from scipy.cluster.hierarchy import linkage + +from clusterpurity import purity_link_color_func, purity_summary + + +def _two_clean_groups(): + """6 points: 3 tightly clustered near (0, 0) labeled 'A', 3 tightly + clustered near (10, 10) labeled 'B' -- each group should merge with + itself long before the two groups merge with each other. + """ + data = np.array([ + [0.0, 0.0], [0.1, 0.0], [0.0, 0.1], + [10.0, 10.0], [10.1, 10.0], [10.0, 10.1], + ]) + labels = ['A', 'A', 'A', 'B', 'B', 'B'] + return data, labels + + +def _scattered_pair_linkage(): + """Hand-built linkage (not derived from real coordinates, so the merge + order is exact and unambiguous) reproducing the real-data pattern that + motivated the overlap-based coloring rule: labels P and Q are each + split across two separate leaves that DON'T merge with each other + first (P0, Q0, Q1, P1 -- interleaved, not P0+P1 then Q0+Q1), so neither + P nor Q is monophyletic -- plus an unrelated label R that cleanly joins + in afterward and should NOT show as part of the tangle. + + Leaves: 0=P, 1=Q, 2=Q, 3=P, 4=R. + Merge order: (1,2)=Q+Q pure; (0, that)=P+{Q} disjoint bridge; + (3, that)=P+{P,Q} overlap -- the actual tangle; (4, that)=R+{P,Q} + disjoint again (R was never part of the P/Q mixing). + """ + Z = np.array([ + [1, 2, 0.1, 2], # node 5: Q+Q -> {Q} (pure) + [0, 5, 1.0, 3], # node 6: P + {Q} -> {P,Q} (disjoint) + [3, 6, 2.0, 4], # node 7: P + {P,Q} -> {P,Q} (overlap!) + [4, 7, 3.0, 5], # node 8: R + {P,Q} -> {P,Q,R} (disjoint) + ]) + labels = ['P', 'Q', 'Q', 'P', 'R'] + return Z, labels + + +def test_purity_summary_both_groups_pure(): + data, labels = _two_clean_groups() + Z = linkage(data, method='ward') + n_pure, n_total = purity_summary(Z, labels) + assert (n_pure, n_total) == (2, 2) + + +def test_purity_link_color_func_clean_disjoint_groups_stay_neutral_even_at_root(): + data, labels = _two_clean_groups() + Z = linkage(data, method='ward') + n_leaves = len(labels) + color_func = purity_link_color_func(Z, labels) + + # The final merge (root) joins group A's whole clade with group B's + # whole clade -- their label sets are disjoint ({A} vs {B}, no overlap), + # so this is a clean join, not evidence either group is non- + # monophyletic. It must be the neutral color, NOT the polyphyletic one + # -- two cleanly-resolved groups simply existing in the same tree isn't + # itself a problem. + root_node_id = n_leaves + len(Z) - 1 + assert color_func(root_node_id) == 'black' + + # Every internal node strictly below the root is a within-group merge + # for this dataset -- those links must be the monophyletic color. + for i in range(len(Z) - 1): + node_id = n_leaves + i + assert color_func(node_id) == 'green' + + +def test_purity_link_color_func_overlap_is_the_only_false_color_and_it_does_not_cascade(): + Z, labels = _scattered_pair_linkage() + color_func = purity_link_color_func(Z, labels) + + assert color_func(5) == 'green' # Q+Q, monophyletic + assert color_func(6) == 'black' # P + {Q}: disjoint, clean bridge + assert color_func(7) == 'magenta' # P + {P,Q}: OVERLAP -- the actual tangle + # R joining afterward is disjoint from {P,Q} -- R was never part of the + # P/Q mixing, so this must NOT also render false_color just because it's + # above (contains) the node-7 tangle. This is the specific behaviour + # this rule exists for: a real, low-level tangle must not paint every + # ancestor false_color all the way to the root. + assert color_func(8) == 'black' + + +def test_purity_link_color_func_custom_colors(): + Z, labels = _scattered_pair_linkage() + color_func = purity_link_color_func(Z, labels, true_color='cyan', false_color='magenta', neutral_color='grey') + assert color_func(5) == 'cyan' + assert color_func(6) == 'grey' + assert color_func(7) == 'magenta' + assert color_func(8) == 'grey' + + +def test_purity_summary_one_mismatched_leaf_breaks_purity_for_its_group(): + # Same as the clean two-group case, but one of group A's points is + # actually closest to group B -- A should no longer be reported pure + # (its leaves don't all merge together before meeting a 'B' leaf), + # while B (unaffected) should still be pure. + data = np.array([ + [0.0, 0.0], [0.1, 0.0], [9.9, 9.9], # last "A" point sits with B + [10.0, 10.0], [10.1, 10.0], [10.0, 10.1], + ]) + labels = ['A', 'A', 'A', 'B', 'B', 'B'] + Z = linkage(data, method='ward') + n_pure, n_total = purity_summary(Z, labels) + assert n_pure == 1 + assert n_total == 2 + + +def test_purity_link_color_func_unknown_link_id_falls_back_to_neutral_color(): + data, labels = _two_clean_groups() + Z = linkage(data, method='ward') + color_func = purity_link_color_func(Z, labels) + assert color_func(99999) == 'black' diff --git a/code/tests/test_groupsets.py b/code/tests/test_groupsets.py index 297b1b3..d4c89c4 100644 --- a/code/tests/test_groupsets.py +++ b/code/tests/test_groupsets.py @@ -99,6 +99,78 @@ def test_remove_all_items_leaves_selection_at_negative_one(): assert model.selected is None +# --------------------------------------------------------------------------- # +# GroupSetModel: move (reordering) +# --------------------------------------------------------------------------- # + +def test_move_reorders_items(): + model = GroupSetModel() + model.add('a') + model.add('b') + model.add('c') + model.move(0, 2) + assert [g.name for g in model] == ['b', 'c', 'a'] + + +def test_move_keeps_selection_on_the_moved_item(): + model = GroupSetModel() + model.add('a') + model.add('b') + model.add('c') + model.select(0) # 'a' selected + model.move(0, 2) + assert model.selected.name == 'a' + assert model.selected_index == 2 + + +def test_move_keeps_selection_on_a_different_item_that_shifted_position(): + model = GroupSetModel() + model.add('a') + model.add('b') + model.add('c') + model.select(1) # 'b' selected + model.move(0, 2) # moves 'a' past 'b', so 'b' shifts from index 1 to 0 + assert model.selected.name == 'b' + assert model.selected_index == 0 + + +def test_move_with_equal_indices_is_a_noop(): + model = GroupSetModel() + model.add('a') + model.add('b') + model.move(1, 1) + assert [g.name for g in model] == ['a', 'b'] + + +def test_move_clamps_out_of_range_indices(): + model = GroupSetModel() + model.add('a') + model.add('b') + model.add('c') + model.move(-5, 99) + assert [g.name for g in model] == ['b', 'c', 'a'] + + +def test_move_on_empty_model_is_a_noop(): + model = GroupSetModel() + model.move(0, 1) # must not raise + assert len(model) == 0 + + +def test_move_disambiguates_value_equal_groupsets_by_identity(): + # Two freshly-added default groupsets compare equal (GroupSet.__eq__ is + # value-based, and both start with identical fields), so move() must + # track the selected item by identity, not by list.index()'s '=='. + model = GroupSetModel() + model.add('dup') + model.add('dup') + model.select(0) + first = model.selected + model.move(0, 1) + assert model.selected is first + assert model.selected_index == 1 + + # --------------------------------------------------------------------------- # # GroupSetModel: CRUD # --------------------------------------------------------------------------- # diff --git a/code/tests/test_ordination.py b/code/tests/test_ordination.py new file mode 100644 index 0000000..992434e --- /dev/null +++ b/code/tests/test_ordination.py @@ -0,0 +1,275 @@ +"""Unit tests for the multivariate ordination backend (``ordination.py``). + +Covers data loading/replicate-collapsing (verified against a synthetic +3-header-row peak table with a known technical/biological-replicate +structure -- see the plan for why this is empirically checked rather than +trusted by inspection) and the PCA/NMDS/PLS-DA/top_loadings math against +small synthetic matrices. +""" + +import numpy as np +import pandas as pd +import pytest + +from ordination import ( + load_ordination_matrix, nmds_loading_proxy, replicate_label_components, + run_nmds, run_pca, run_plsda, similarity_matrix, top_loadings, +) + + +# --------------------------------------------------------------------------- # +# load_ordination_matrix / collapse_replicates +# --------------------------------------------------------------------------- # + +def _write_synthetic_filtered_csv(path): + """3 samples (S1, S1b in groupA; S2 in groupB), 3 technical-replicate + injections each (9 injections total) -- enough to tell "collapsed to + one row per Sample" apart from both "per Injection" (9) and "per + Biolgroup" (2, since there are only 2 biolgroups but 3 samples). + """ + with open(path, 'w') as f: + f.write(',,,groupA,groupA,groupA,groupA,groupA,groupA,groupB,groupB,groupB\n') + f.write(',,,S1,S1,S1,S1b,S1b,S1b,S2,S2,S2\n') + f.write('Compound,m/z,Retention time (min),inj1,inj2,inj3,inj4,inj5,inj6,inj7,inj8,inj9\n') + f.write('feat1,100.0,1.0,10,12,11,30,32,31,50,52,51\n') + f.write('feat2,200.0,2.0,5,6,4,15,16,14,20,19,21\n') + + +def _raw_header(path): + return pd.read_csv(path, sep=',', header=None, index_col=[0, 1, 2]).iloc[:3, :].transpose() + + +def test_uncollapsed_keeps_one_row_per_injection(tmp_path): + path = tmp_path / 'example_filtered.csv' + _write_synthetic_filtered_csv(path) + x, biolgroup = load_ordination_matrix(path, _raw_header(path), collapse_replicates=False) + assert x.shape == (9, 2) + assert len(biolgroup) == 9 + + +def test_collapsed_averages_technical_not_biological_replicates(tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) # 'averagepca.csv' lands here, not the repo + path = tmp_path / 'example_filtered.csv' + _write_synthetic_filtered_csv(path) + x, biolgroup = load_ordination_matrix(path, _raw_header(path), collapse_replicates=True) + + # 3 distinct Samples (S1, S1b, S2) -- not 9 (uncollapsed) and not 2 + # (the number of Biolgroups, which would mean biological replicates got + # wrongly merged too). + assert x.shape[0] == 3 + assert x.shape[1] == 2 + assert biolgroup.nunique() == 2 + assert sorted(biolgroup.unique()) == ['groupA', 'groupB'] + # Two of the three collapsed rows belong to groupA (S1, S1b). + assert (biolgroup == 'groupA').sum() == 2 + assert (biolgroup == 'groupB').sum() == 1 + + +def test_collapsed_values_are_the_mean_of_their_technical_replicates(tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) + path = tmp_path / 'example_filtered.csv' + _write_synthetic_filtered_csv(path) + x, _ = load_ordination_matrix(path, _raw_header(path), collapse_replicates=True) + + # S1's feat1 replicates are 10, 12, 11 -> mean 11. + s1_row = x.loc[x.index.str.contains('S1') & ~x.index.str.contains('S1b')] + assert s1_row['feat1'].iloc[0] == pytest.approx(11.0) + + +# --------------------------------------------------------------------------- # +# replicate_label_components +# --------------------------------------------------------------------------- # + +def test_replicate_label_components_numbers_bio_and_tech_reps(tmp_path): + # Reuses the same fixture as the collapse tests: groupA has 2 Samples + # (S1, S1b -- BioRep 1 and 2), groupB has 1 Sample (S2 -- BioRep 1, the + # "only one biological replicate" edge case), every Sample has 3 + # Injections (TechRep 1-3). + path = tmp_path / 'example_filtered.csv' + _write_synthetic_filtered_csv(path) + components = replicate_label_components(_raw_header(path)) + + assert components.loc['inj1', ['Biolgroup', 'Sample', 'BioRep', 'TechRep']].tolist() == ['groupA', 'S1', 1, 1] + assert components.loc['inj3', ['Biolgroup', 'Sample', 'BioRep', 'TechRep']].tolist() == ['groupA', 'S1', 1, 3] + assert components.loc['inj4', ['Biolgroup', 'Sample', 'BioRep', 'TechRep']].tolist() == ['groupA', 'S1b', 2, 1] + # groupB has only one Sample -- still BioRep 1, not skipped/blank. + assert components.loc['inj7', ['Biolgroup', 'Sample', 'BioRep', 'TechRep']].tolist() == ['groupB', 'S2', 1, 1] + + +def test_replicate_label_components_single_technical_replicate(tmp_path): + # Edge case: a Sample with only one Injection should still get + # TechRep=1, not be skipped or raise. + path = tmp_path / 'single_techrep_filtered.csv' + with open(path, 'w') as f: + f.write(',,,groupA,groupA,groupB\n') + f.write(',,,S1,S1b,S2\n') + f.write('Compound,m/z,Retention time (min),inj1,inj2,inj3\n') + f.write('feat1,100.0,1.0,10,30,50\n') + components = replicate_label_components(_raw_header(path)) + + assert components.loc['inj1', ['Sample', 'BioRep', 'TechRep']].tolist() == ['S1', 1, 1] + assert components.loc['inj2', ['Sample', 'BioRep', 'TechRep']].tolist() == ['S1b', 2, 1] + assert components.loc['inj3', ['Sample', 'BioRep', 'TechRep']].tolist() == ['S2', 1, 1] + + +# --------------------------------------------------------------------------- # +# run_pca / run_nmds / run_plsda / top_loadings +# --------------------------------------------------------------------------- # + +def _make_low_rank_matrix(): + # 12 samples, 5 features, but only 2 real underlying dimensions of + # variation -- PCA on this should recover ~100% explained variance in + # the first 2 components. + rng = np.random.RandomState(0) + latent = rng.normal(size=(12, 2)) + loading_matrix = rng.normal(size=(2, 5)) + x = pd.DataFrame( + latent @ loading_matrix + rng.normal(scale=0.01, size=(12, 5)), + index=[f's{i}' for i in range(12)], + columns=[f'f{i}' for i in range(5)], + ) + return x + + +def test_pca_recovers_known_low_rank_structure(): + x = _make_low_rank_matrix() + scores, loadings, expvar = run_pca(x, n_components=3) + assert scores.shape == (12, 3) + assert loadings.shape == (5, 3) + # Two real dimensions of variation + tiny noise -> first two components + # should capture almost all the variance. + assert expvar[:2].sum() > 0.99 + + +def test_plsda_separates_two_groups_along_first_component(): + rng = np.random.RandomState(1) + group_a = rng.normal(loc=0, scale=0.5, size=(10, 6)) + group_b = rng.normal(loc=5, scale=0.5, size=(10, 6)) + x = pd.DataFrame( + np.vstack([group_a, group_b]), + index=[f's{i}' for i in range(20)], + columns=[f'f{i}' for i in range(6)], + ) + y = pd.Series(['A'] * 10 + ['B'] * 10, index=x.index) + + scores, loadings, expvar = run_plsda(x, y, n_components=2) + assert scores.shape == (20, 2) + assert loadings.shape == (6, 2) + # The groups are cleanly separated along PLS1: every A score should be + # on one side of 0 and every B score on the other (sign is arbitrary). + pls1 = scores['PLS1'] + assert (pls1[:10] > 0).all() != (pls1[10:] > 0).all() + # A real, well-separated signal should explain a meaningful share of + # variance -- catches the scale=True/scale=False bug (manually + # confirmed against real data: that bug produced ratios on the order of + # 1e-6 instead of comparable-to-PCA's ~0.7). + assert expvar[0] > 0.1 + + +def test_nmds_smoke_test_on_clustered_data(): + rng = np.random.RandomState(2) + cluster_a = rng.normal(loc=0, scale=0.2, size=(6, 8)) + cluster_b = rng.normal(loc=10, scale=0.2, size=(6, 8)) + x = pd.DataFrame( + np.vstack([cluster_a, cluster_b]), + index=[f's{i}' for i in range(12)], + columns=[f'f{i}' for i in range(8)], + ) + scores, expvar, stress = run_nmds(x, n_components=2) + assert scores.shape == (12, 2) + assert len(expvar) == 2 + assert np.isfinite(stress) + assert stress >= 0 + + proxy = nmds_loading_proxy(x, scores) + assert proxy.shape == (8, 2) + assert proxy.values.min() >= -1.0001 and proxy.values.max() <= 1.0001 + + +# --------------------------------------------------------------------------- # +# similarity_matrix +# --------------------------------------------------------------------------- # + +def test_similarity_matrix_spearman_self_correlation_is_one(): + x = pd.DataFrame( + [[1.0, 2.0, 3.0], [3.0, 2.0, 1.0], [1.0, 5.0, 2.0]], + index=['s1', 's2', 's3'], columns=['f1', 'f2', 'f3'], + ) + sim = similarity_matrix(x, 'Spearman') + assert sim.shape == (3, 3) + assert np.allclose(np.diag(sim.values), 1.0) + # s1 and s2 are perfectly rank-anticorrelated. + assert sim.loc['s1', 's2'] == pytest.approx(-1.0) + + +def test_similarity_matrix_jaccard_identical_presence_is_one(): + # s1/s2 detect exactly the same features (different abundances); + # s3 detects none of them. + x = pd.DataFrame( + [[5.0, 0.0, 2.0], [50.0, 0.0, 20.0], [0.0, 0.0, 0.0]], + index=['s1', 's2', 's3'], columns=['f1', 'f2', 'f3'], + ) + sim = similarity_matrix(x, 'Jaccard') + assert sim.loc['s1', 's2'] == pytest.approx(1.0) + assert np.allclose(np.diag(sim.values)[:2], 1.0) + + +def test_similarity_matrix_braycurtis_identical_profiles_is_one(): + x = pd.DataFrame( + [[1.0, 2.0, 3.0], [1.0, 2.0, 3.0], [10.0, 0.0, 0.0]], + index=['s1', 's2', 's3'], columns=['f1', 'f2', 'f3'], + ) + sim = similarity_matrix(x, 'Bray-Curtis') + assert sim.loc['s1', 's2'] == pytest.approx(1.0) + assert sim.loc['s1', 's3'] < sim.loc['s1', 's2'] + + +def test_similarity_matrix_unknown_method_raises(): + x = pd.DataFrame([[1.0, 2.0]], index=['s1'], columns=['f1', 'f2']) + with pytest.raises(ValueError): + similarity_matrix(x, 'Pearson') + + +# --------------------------------------------------------------------------- # +# top_loadings +# --------------------------------------------------------------------------- # + +def _make_loadings(n=30): + rng = np.random.RandomState(3) + return pd.DataFrame( + rng.normal(size=(n, 2)), + index=[f'feat{i}' for i in range(n)], + columns=['PC1', 'PC2'], + ) + + +def test_top_loadings_returns_n_rows_by_default(): + loadings = _make_loadings(30) + top = top_loadings(loadings, n=10) + assert len(top) == 10 + + +def test_top_loadings_includes_forced_feature_outside_top_n(): + loadings = _make_loadings(30) + top = top_loadings(loadings, n=5) + magnitude = np.sqrt((loadings ** 2).sum(axis=1)) + smallest_feature = magnitude.idxmin() + assert smallest_feature not in top.index + + top_forced = top_loadings(loadings, n=5, always_include=[smallest_feature]) + assert len(top_forced) == 6 + assert smallest_feature in top_forced.index + + +def test_top_loadings_forced_feature_already_in_top_n_is_not_duplicated(): + loadings = _make_loadings(30) + magnitude = np.sqrt((loadings ** 2).sum(axis=1)) + largest_feature = magnitude.idxmax() + top = top_loadings(loadings, n=10, always_include=[largest_feature]) + assert len(top) == 10 # already in the top 10, no duplicate row added + + +def test_top_loadings_n_larger_than_available_returns_everything(): + loadings = _make_loadings(5) + top = top_loadings(loadings, n=100) + assert len(top) == 5 diff --git a/code/treemap.png b/code/treemap.png deleted file mode 100644 index c5e68a8..0000000 Binary files a/code/treemap.png and /dev/null differ diff --git a/code/ui_functions.py b/code/ui_functions.py index e0bbf68..148629f 100644 --- a/code/ui_functions.py +++ b/code/ui_functions.py @@ -83,7 +83,14 @@ def uiDefinitions(self): self.ui.frame_plts.hide() self.ui.checkBox_fc.hide() self.ui.checkBox_ttest.hide() - self.dialog.ui.checkBox_bootstrap.setChecked(True) + # "Bootstrap Analysis" and "Collapse Technical Replicates" moved + # from this global plot-config dialog onto their one relevant + # plot's own switcher bar (plot_dendrogram's "Bootstrap" checkbox, + # plot_ordination's "Collapse Replicates" checkbox) -- hide the + # now-orphaned dialog widgets rather than editing the generated + # ui_plotparam.py. + self.dialog.ui.frame_bootstrap.hide() + self.dialog.ui.frame_2.hide() # Top bar functions self.ui.btn_maximize.clicked.connect(lambda: UIFunctions.maximize_restore(self)) @@ -117,8 +124,8 @@ def uiDefinitions(self): self.ui.btn_cvplt.clicked.connect(lambda: self.ui.stackedWidget_review.setCurrentIndex(2)) self.ui.btn_datasummary.clicked.connect(lambda: self.ui.stackedWidget_review.setCurrentIndex(3)) - self.ui.btn_upsetplt.clicked.connect(lambda: self.ui.stackedWidget_grpanalysis.setCurrentIndex(0)) - self.ui.btn_samplecorr.clicked.connect(lambda: self.ui.stackedWidget_grpanalysis.setCurrentIndex(1)) + self.ui.btn_upsetplt.clicked.connect(lambda: UIFunctions.switch_grpanalysis_tab(self, 0)) + self.ui.btn_samplecorr.clicked.connect(lambda: UIFunctions.switch_grpanalysis_tab(self, 1)) #feature info bar functions self.ftrdialog.ui.btn_close.clicked.connect(lambda: self.ftrdialog.hide()) @@ -211,6 +218,8 @@ def goto_search(self): self.dbsearchdone = True stop_functime('dbsearch complete') reset_runtime() + elif not self.analysisrun: + self.error('Run an analysis before searching.') #plotbar functions def goto_review(self): @@ -221,6 +230,15 @@ def goto_review(self): self.dialog.ui.checkBox_applyfilter.hide() + def switch_grpanalysis_tab(self, idx): + """Switch the Group Analysis sub-tab (UpSet Plot=0, Sample + Correlations=1) and grey out plot_samplecorr's Method/View/Use-Names + controls -- shared with btn_upsetplt/btn_samplecorr in frame_12 -- + whenever the UpSet Plot tab is active, since they don't apply there.""" + self.ui.stackedWidget_grpanalysis.setCurrentIndex(idx) + if getattr(self, 'samplecorr', None) is not None: + self.samplecorr.set_controls_enabled(idx == 1) + def goto_upset(self): self.ui.stackedWidget_infobar.setCurrentIndex(1) self.ui.stackedWidget_plot.setCurrentIndex(9) diff --git a/devnotes.md b/devnotes.md index b99c9dd..cb6689e 100644 --- a/devnotes.md +++ b/devnotes.md @@ -107,8 +107,11 @@ that way. Required deps (gate startup): `epam.indigo`→`indigo`, `UpSetPlot`→ (shared save/restore schema for simple `analysis_parameters` checkbox fields), `biogroups.py` (`getgroups()`'s metadata-join/group-derivation core), `dbsearch.py` (`fulldbsearch()`'s NPAtlas ppm-window matching - core). Each corresponding `MainWindow` method is now a thin wrapper: - call the module function, then apply the result to widgets/`self`. + core), `ordination.py` (PCA/NMDS/PLS-DA + technical-replicate collapsing + + top-N loadings selection for the multivariate plot tab), `clusterpurity.py` + (dendrogram branch-purity coloring for the dendrogram tab). Each + corresponding `MainWindow` method is now a thin wrapper: call the module + function, then apply the result to widgets/`self`. - **Runtime widget substitution into a Designer placeholder** is an established pattern here, not a one-off — `plotting.py` does it for every matplotlib canvas (inserted into a Designer-created `QFrame`), and @@ -159,7 +162,8 @@ python -m pytest code/tests -q ``` Covers `filter`, `stats`, `importdependencies`, `translators`, `groupsets`, -`searchtree`. Add tests here for any new Qt-free logic. +`searchtree`, `ordination`, `clusterpurity`. Add tests here for any new +Qt-free logic. `conftest.py` sets `QT_QPA_PLATFORM=offscreen` and provides a session-scoped `qapp` fixture: PyQt5 widgets/models/signals *can* be exercised headlessly via @@ -188,6 +192,383 @@ on load. UI uses `QListWidget` (generated, off-limits), so this is not a true `QAbstractListModel`/`QListView` setup — the "view" side is the existing hand-written widget-sync code in `ui_functions.py`, kept thin. +## Multivariate ordination plot (`plotting.plot_ordination`, `ordination.py`) + +What used to be called "PCA" (`plot_PCA`, `checkBox_pca`/`btn_pca`'s old +tooltip) actually only ran NMDS, with a PCA rotation applied to the NMDS +coordinates purely to orient the axes — not a second ordination of the +original features. `plot_ordination` now genuinely supports PCA, NMDS, and +PLS-DA, switchable via a combo-box bar inserted above the plot canvas (same +runtime widget-substitution pattern as `searchtree.py`'s filter bar — see +above), plus a Scores/Loadings view toggle. The math lives in the Qt-free +`ordination.py` (unit-tested in `tests/test_ordination.py`); `plotting.py` +only handles the combo boxes, axes, and pick events. + +- **Save-file compatibility preserved on purpose**: `analysis_params.PCA` + and `checkBox_pca`'s objectName are unchanged (still pickled into `.mpct` + saves) — only the visible checkbox text/tooltip changed (set at runtime in + `MainWindow.__init__`, same mechanism as `label_credits.setText`). Only + the hand-written class name (`plot_PCA` → `plot_ordination`) changed, + since that's never pickled. +- **"Collapse Technical Replicates" used to be dead** (`plotting.py` had + `parent.collapsereps = False#parent.dialog.ui.checkBox_collapsereps.isChecked()` + — hardcoded off, the real read commented out). Now wired for real via + `ordination.load_ordination_matrix(..., collapse_replicates=...)`. The + collapse logic itself (average technical replicates/Injections, keep + biological replicates/Samples distinct) was ported verbatim from the + original rather than rewritten — its header-relabeling-via-CSV-round-trip + is easy to get subtly wrong by inspection, so it's verified empirically + instead (`test_ordination.py`'s synthetic-replicate-structure test, cross- + checked against real example data with a scratch script during + development). +- **The checkbox itself later moved off the global plot-config dialog**: + `checkBox_collapsereps` only ever affected this one plot, so it's now + `plot_ordination`'s own "Collapse Replicates" checkbox (`self.collapse_replicates`, + default `True`) in its switcher bar, same move as `plot_dendrogram`'s + "Bootstrap" checkbox (see the dendrogram section). The dialog checkbox's + containing frame (`frame_2`) is hidden at runtime rather than edited out + of generated code. This field was never in `paramfields.CHECKBOX_FIELDS` + (wasn't pickled before either), so no save/load behavior changed. +- **Loadings view and high-dimensional data**: thousands of features can't + all be drawn legibly, so only the top-25 by loading-vector magnitude are + shown by default (`ordination.top_loadings()`). Whichever feature is + currently highlighted elsewhere in the app (`MainWindow.pickedfeature`) is + always included regardless of magnitude — `plot_ordination.highlight_loading()`, + called from `MainWindow._refresh_highlight()`, follows the same + pre-create-an-empty-artist/update-via-`set_data()` convention every other + plot's highlight marker already uses. This is a *feature* highlight + (Loadings view), a different concept from the Scores view's existing + *sample* highlight (`parent.pickedsample`, set by clicking a sample point) + — the two views show different kinds of points and were never the same + selection concept. +- **NMDS has no linear feature loadings** (it's a rank-based embedding, not + a linear projection) — its Loadings view uses `ordination.nmds_loading_proxy()`, + per-feature correlation with each NMDS axis (the standard ecology "vector + fitting"/`envfit` approach), not real loadings. **NMDS's axis labels don't + show percent-explained at all** (just "NMDS1"/"NMDS2") — NMDS is a + rank-based embedding, not a linear decomposition of the feature space, so + it doesn't canonically have a %-variance-explained quantity the way + PCA/PLS-DA do; the plot title shows stress instead, the conventional NMDS + fit-quality metric. +- **PCA/PLS-DA are autoscaled** (`ordination.autoscale()`: mean-center + + scale each feature to unit variance) before fitting — without this, raw + intensities (confirmed on real example data: feature standard deviations + ranged from ~1.8 to ~10,000, a ~5800x spread) let a handful of + high-abundance features dominate both the apparent explained-variance and + the loadings, drowning out features that actually separate the + biological groups but happen to have lower raw intensity. This is the + standard chemometrics pretreatment for PCA/PLS-DA on this kind of data; + NMDS is deliberately NOT autoscaled (its Bray-Curtis dissimilarity is + conventionally computed on raw/relative abundances). +- **PLS-DA's explained-variance gotcha**: `sklearn.cross_decomposition.PLSRegression` + defaults to `scale=True` (standardizes X internally), which -- before + autoscaling was added -- silently produced explained-variance ratios off + by ~6 orders of magnitude when compared against unscaled total variance, + caught only by running against real data, not by inspection. Fixed with + `scale=False` and autoscaling `x` ourselves first instead (matching PCA's + treatment, and avoiding PLSRegression's `scale=True` also incorrectly + scaling the 0/1 group-membership dummy columns). +- **Loadings-view rendering gotchas** (both only surfaced by checking + against real data, not by inspection): (1) `ax.annotate()`-drawn arrows + don't reliably participate in matplotlib's autoscale the way + `ax.scatter()`/`ax.plot()` do — confirmed empirically that plotted arrow + tips could fall outside the axis' auto-picked view limits — so + `plot_ordination._plot_loadings()` now sets `ax.set_xlim`/`set_ylim` + explicitly from the actually-plotted subset's coordinates, symmetric + around 0. (2) `ordination.top_loadings()` must be called with only the 2 + displayed components (`loadings.iloc[:, :2]`), not the full (up to + 10-component) loadings — ranking by overall magnitude across all + components could let a feature into the "top 25" purely from a large + contribution to some unplotted component while barely showing up in the + actual PC1-vs-PC2 view, displacing a feature that's genuinely prominent + there. +- **OPLS-DA intentionally not implemented**: no native scikit-learn support; + the alternatives (the unmaintained `pyopls` package, or a from-scratch + orthogonal-signal-correction implementation) are both riskier than + shipping PCA/NMDS/PLS-DA without a reference dataset to validate against. + Logged here as the next ordination method to add if ever revisited, not + started. + +## Dendrogram purity coloring (`plotting.plot_dendrogram`, `clusterpurity.py`) + +The dendrogram tab has a switcher bar (same runtime-widget-substitution +pattern as `plot_ordination`'s method/view bar) with two combo boxes (View, +Color) and two checkboxes (Bootstrap, Use Sample/Group Names -- both +documented further down, formerly/newly local to this tab respectively): + +- **View** — which leaves to cluster: + - **Technical Replicates** (default — matches the tab's original + behaviour): every Injection is its own leaf, purity judged against + Sample membership — a tight monophyletic clump means that sample's + replicates agree. + - **Biological Replicates**: technical replicates are averaged first + (same `ordination.load_ordination_matrix(..., collapse_replicates=True)` + used by the multivariate tab's checkbox), so leaves are Samples, purity + judged against Biolgroup instead — a monophyletic clade means a whole + biological group's samples cluster together before meeting another + group, i.e. the groups are separable. +- **Color** — how to render purity: + - **Purity** (default): green wherever a branch's leaves are entirely one + group (correctly clustered), magenta wherever a branch mixes more than + one group (polyphyletic) — a QC judgment visible at a glance rather than + read off leaf labels one at a time. The plot title reports + `n_pure/n_total` (e.g. "7/9 samples' replicates clustered together", + "3/3 biological groups separable") via `clusterpurity.purity_summary()`. + - **None**: plain black, no title — deliberately reproduces the tab's + appearance from *before* purity coloring existed (there was no title at + all previously), for anyone who just wants the dendrogram shape without + the QC overlay. Implemented as `link_color_func=None` with + `color_threshold=0` still set (dropping `color_threshold=0` here was a + real regression caught while testing: without it, scipy falls back to + its own default 0.7-of-max-height threshold and a multi-color palette + instead of plain black). + +Both views' purity math is the same Qt-free linkage-traversal logic in +`clusterpurity.py`, unit-tested in `tests/test_clusterpurity.py`. + +- **`false_color` marks proven non-monophyly (overlap), not "any impure + merge"**: two earlier attempts both got this wrong in opposite directions. + First, every impure merge was colored `false_color`, including every + ancestor above a single mixing event all the way to the root -- since + almost any real dataset has *some* mixing somewhere, this painted most of + the tree's upper structure regardless of how localized the problem was. + The second attempt ("impure but at least one child was pure = bridge = + `false_color`, both children already impure = neutral") fixed the worst + of the cascading but still mis-colored real data: it could still mark a + high-level merge `false_color` merely because one side happened to be a + single freshly-introduced pure clade, *and* it could miss real tangles + where two already-impure children share a label without one side being + trivially pure. + + `purity_link_color_func()` now compares the two children's label sets + directly at each merge: + - identical and a single label -> monophyletic (`true_color`/green). + - **disjoint** (no label in common) -> neutral (`neutral_color`/black) -- + a clean join of two regions that don't contradict each other, *even if + one or both children are themselves impure from a different label's + tangle further down*. This is what stops a low-level tangle from + cascading: once a tangled label has nothing more of itself left to fold + in, every merge above it only ever joins disjoint regions, so it goes + back to black. + - **overlap** (share >=1 label, without being identical-and-singleton) -> + polyphyletic (`false_color`/magenta) -- definitive proof that some + label's leaves are split across this exact merge (present on both + sides), not just "still mixed from an earlier merge". + + Verified against the real example dataset's bootstrap dendrogram (the + case that exposed both earlier bugs): only the two merges that actually + re-unite a scattered sample's replicates (e.g. one sample's reps split + into two non-sister sub-clades that only meet again higher up) render + `false_color`; the higher-level merges joining that region with + cleanly-resolved, unrelated samples stay black, same as a hand-built + synthetic linkage (`tests/test_clusterpurity.py`'s + `_scattered_pair_linkage`) reproducing the same pattern deterministically. + + `true_color`/`false_color` default to green/magenta, not the more + conventional green/red: red-green colorblindness (the most common form) + can't distinguish red from green, while magenta stays distinguishable + from green under all common forms of color vision deficiency. (Changed + from an original green/red default after user feedback; see + `clusterpurity.py`'s `purity_link_color_func()` default args and + `plotting.py`'s `plot_dendrogram.plot()` call site.) +- **Bootstrap is now a per-tab checkbox, not a global one**: the + plot-config dialog's "Bootstrap Analysis" checkbox (`checkBox_bootstrap`) + only ever affected this one plot, so it moved into `plot_dendrogram`'s own + switcher bar (`self.bootstrap`, default `True` -- matching the effective + startup default the old checkbox was forced to in `UIFunctions`, which + differed from its own Designer-set default of `False`). The dialog + checkbox's containing frame (`frame_bootstrap`) is hidden at runtime in + `UIFunctions` rather than edited out of the generated `ui_plotparam.py`. + `('bootstrap', ...)` was also dropped from `paramfields.CHECKBOX_FIELDS`, + so it's no longer saved into `.mpct` files -- consistent with the + dendrogram's other per-tab state (View, Color), none of which persist + across save/load either. + +- **Purity is a strict, whole-group check, not "any uniform subset"**: a + label only counts as pure if *every* leaf carrying it ends up in one clade + before that clade touches a different label — 2 of a Sample's 3 replicates + merging together does NOT make that Sample pure if the third replicate + clusters elsewhere. An earlier version of `purity_summary()` got this + wrong (counted a label pure as soon as ANY uniform-label merge occurred, + which is right for `purity_link_color_func`'s per-branch coloring but wrong + for the whole-group summary count) — caught by a test built from a + deliberately "rogue" planted point, not by inspection. +- **PvClust orientation gotcha**: `pvclust.PvClust` expects "variables x + objects" (rows = the things bootstrapped over, i.e. features; it + transposes internally before clustering the columns) — the *opposite* + orientation from `scipy.cluster.hierarchy.linkage`, which expects "objects + x variables". `plot_dendrogram.plot()` builds both orientations + (`data_for_linkage`, `data_for_pvclust`) from the same scaled data rather + than reusing one array, since which one is "transposed" flips between the + Technical (features x injections is the natural read) and Biological + (samples x features, from `load_ordination_matrix`) views. +- **`link_color_func` threaded through the bootstrap path too**: + `PvClust.plot()` and the free function `pvclust.plot_dendrogram()` both + gained a `link_color_func=None` passthrough parameter into their inner + `scipy.cluster.hierarchy.dendrogram()` call, so the AU/BP bootstrap + dendrogram gets the same purity coloring as the regular one (`scipy`'s own + precedence rule: `link_color_func`, when given, overrides + `color_threshold`/`above_threshold_color`). +- **Multiprocessing safety note (re-learned, not new)**: validating the + bootstrap path's wiring during development used `parallel=False` and a + tiny `nboot`, never `PvClust(..., parallel=True)` in an ad hoc script — + `multiprocessing.Pool()` re-executes a script's top-level code in each + spawned child on Windows unless the call site is guarded by + `if __name__ == '__main__':` (the same class of hazard as the frozen-exe + fork-bomb bug elsewhere in this file, just without needing + `freeze_support()` specifically). The real app is fine — `main.py` already + guards its entry point — but throwaway test scripts need the same + discipline. +- **"Use Sample/Group Names" leaf labels** (`ordination.replicate_label_components()`): + swaps the raw file/injection names for `_b_s` + (Technical Replicates view) or `_b` (Biological + Replicates view -- no TechRep#, since that view already collapsed + technical replicates), for when the real file names are long or + uninformative. BioRep# is the 1-based rank of a Sample within its + Biolgroup (first-seen order); TechRep# is the 1-based rank of an + Injection within its Sample -- both numbers are assigned unconditionally, + so a Biolgroup with only one Sample still shows `_b1` and a Sample with + only one Injection still shows `_s1` (no special-casing needed for either + edge case, verified in `test_ordination.py`). This only changes the + `labels=` argument passed to `dendrogram()`/`PvClust.plot()` -- the + underlying data orientation, clustering, and purity-coloring lookups all + still key off the raw names internally. +- **AU/BP label scaling, regardless of leaf count**: the bootstrap + dendrogram's per-node AU/BP annotations used to be positioned with a + fixed x-shift in *icoord* units (e.g. `x-7`, with a separate `x-10` for + 3-digit "100" values). icoord spacing is always 10 units per leaf no + matter how many leaves there are, but the axes' actual pixel width isn't + -- with more leaves squeezed into the same plot width, each icoord unit + maps to fewer and fewer pixels, so that fixed icoord offset shrinks to an + ever-smaller *pixel* gap, eventually merging "AU"/"BP" into "AUBP" (and + every node's AU/BP pair into illegible overlapping text) once there are + enough leaves. Fixed by switching to `ax.annotate(..., xytext=(±2, 0), + textcoords='offset points', ha='right'/'left')`: a constant gap in + *points* (real pixels-at-a-given-DPI) stays a constant gap regardless of + icoord scale or leaf count, and `ha='right'`/`ha='left'` anchoring makes + the old digit-width-dependent branching (-7 vs -10) unnecessary entirely. + Per-node fontsize is also now scaled down as leaf count grows + (`max(5, min(8, 140 / n_leaves))`) so neighbouring *different* nodes' + labels -- which do have a fixed minimum icoord (and therefore pixel) + separation -- don't run into each other either. Verified by rendering + both a 6-leaf and a 27-leaf synthetic tree (matching the real 9-sample + x3-techrep dataset's leaf count) and visually confirming no overlap in + either. Also removed a `plt.figure(figsize=(12,8))`/`plt.tight_layout()` + pair that created and immediately abandoned an unused Figure on every + call -- it never affected the actual target `axis` and was a real (if + small) per-redraw resource leak. + +## Treemap / upset plot canvases (`plotting.plot_treemap`, `plotting.plot_upset`) + +These two tabs used to be the only plots in the app that weren't real +matplotlib canvases: `gen_treemap`/`gen_upsetplt` (free functions, not +`ui_plot` subclasses) drew with `squarify`/`upsetplot`, `savefig()`'d a PNG +to the repo root (`treemap.png`/`test_upsetplt.png`), then loaded that PNG +into a `QPixmap` on the Designer-placed `label_treemap`/`label_upset`. That +meant no zoom/pan/save-at-resolution toolbar, a flat raster rewritten from +scratch on every run, and files left sitting at the repo root. + +Both are now `ui_plot`-style classes (`plot_treemap`/`plot_upset`) drawing +directly onto a persistent `FigureCanvas`, wired into `MainWindow._generate_plots()` +via `_create_or_reset()` exactly like every other plot — so they're created +once and `.reset()` afterward, regenerating on both a fresh analysis run and +the dialog's "Apply" button (`regenerateplts()`), the same as every other +plot. They previously were NOT regenerated by Apply at all (`gen_treemap`/ +`gen_upsetplt` were only ever called once, directly from `_finish_analysis`) +— a small behavior change, but one that brings them in line with how every +other plot already worked, not a new inconsistency. + +- **`frame_treemap`/`frame_upset` needed a different substitution trick**: + unlike most plot frames (empty in Designer, so `ui_plot.__init__` can just + call `frame.setLayout(...)`), these two already have a Designer-built + layout holding the old placeholder `QLabel` — Qt refuses `setLayout()` on + a frame that already has one. `plotting._detach_placeholder_widget()` + removes the old label and reparents the old layout onto a throwaway + widget (the standard Qt "delete this layout" trick) before the normal + `ui_plot.__init__`/manual canvas setup runs — same runtime + widget-substitution pattern as `searchtree.py`'s filter-bar swap, just + with an extra detach step first. Verified headlessly (offscreen Qt) that + this doesn't raise and the frame ends up with exactly the new layout. +- **`plot_upset` doesn't subclass `ui_plot`**, same as `plot_heatmap` and for + the same reason: `upsetplot.plot()` lays out several axes (matrix, + totals, intersections, shading) via its own gridspec on whatever figure + it's given — there's no single "ax" to hand callers the way every + scatter/line plot here has. Unlike `plot_heatmap` (which has to transplant + axes from a brand-new seaborn figure onto the persistent one, since + `sns.clustermap()` doesn't accept an existing figure), `upsetplot.plot()` + takes a `fig=` kwarg directly — `reset()` just `fig.clf()`s and re-plots + onto the same figure, no axes-transplant needed. +- **Verified against real data, not just import-checked**: a throwaway + headless-Qt script built fake Designer-style frames (QFrame + layout + + placeholder QLabel, matching `frame_treemap`/`frame_upset`'s actual + structure), ran both classes against the real example dataset, and + asserted (1) the new canvas/toolbar actually replaced the old layout, (2) + axes count is stable across `.reset()` calls (not growing — would mean + old axes/figures were leaking), and (3) no PNG got written to disk by + either plot anymore. + +## Sample correlation matrix (`plotting.plot_samplecorr`, `ordination.similarity_matrix`) + +Used to be a hardcoded Spearman-only heatmap with technical replicates +always pre-averaged and no way to relabel the raw injection/sample names. +Now has a Method (Spearman/Jaccard/Bray-Curtis) switcher, a View +(Biological Replicates/Individual Injections/Biological Groups) switcher, +and a "Use Sample/Group Names" checkbox — same nomenclature and +`ordination.replicate_label_components()` reuse as `plot_dendrogram`'s. + +- **`ordination.similarity_matrix(x, method)`** is the new Qt-free backend + (covered by `test_ordination.py`): `x` is samples x features, same + convention as `run_pca`/`run_nmds`/`run_plsda`. Spearman is + `x.transpose().corr(method='spearman')`; Jaccard/Bray-Curtis go through + `sklearn.metrics.pairwise_distances` (`metric='jaccard'`/`'braycurtis'`) + and return `1 - distance`. Jaccard is computed on `x > 0` (presence/ + absence of detection, ignoring abundance) — deliberately *not* derived + from the groupset query-dict machinery the user floated as a possible + source, since that's per-feature-list bookkeeping for the UpSet/treemap + tabs, a different concept from per-sample/group detection. + Pearson/Kendall were considered and rejected: Pearson assumes + normally-distributed abundances (the wrong fit for heavy-tailed LC-MS + intensities, same reasoning that makes Spearman the established choice + here), Kendall is a slower, largely redundant rank-correlation + alternative to Spearman. +- **Controls live in the *shared* `frame_12`/`horizontalLayout_25` nav + bar** (the one holding the pre-existing `btn_upsetplt`/`btn_samplecorr` + buttons that switch `stackedWidget_grpanalysis`), not in this plot's own + canvas frame — unlike `plot_dendrogram`/`plot_ordination`'s per-canvas + switcher bars, these controls are specific to the Sample Correlations + page but the nav bar is shared with the unrelated UpSet Plot page. + `plot_samplecorr._build_grpanalysis_controls()` appends a stretch then + its own control widget onto the existing layout (no `ui_main.py` edit) + so the new controls sit to the right of the two buttons and the bar + stays a single row regardless of window width. +- **Greying out on the UpSet Plot tab**: `ui_functions.py`'s + `btn_upsetplt`/`btn_samplecorr` click handlers used to call + `stackedWidget_grpanalysis.setCurrentIndex()` directly; they now route + through `UIFunctions.switch_grpanalysis_tab(self, idx)`, which also calls + `self.samplecorr.set_controls_enabled(idx == 1)` (guarded by + `getattr(self, 'samplecorr', None) is not None`, since this can fire + before any analysis has run and created the plot object). The Designer + default for `stackedWidget_grpanalysis` is already index 1 + (Sample Correlations), so the controls start enabled, matching the + default active page. +- **View → row/column construction**: "Biological Replicates" and + "Individual Injections" reuse `ordination.load_ordination_matrix()` + exactly like `plot_dendrogram` does (`collapse_replicates=True`/`False`). + "Biological Groups" takes the collapsed (Biological Replicates) matrix + and does one more `x.groupby(biolgroup).mean()` to average across + biological replicates too — deliberately not a third mode inside + `load_ordination_matrix` itself, since it's a trivial one-line reduction + of an already-correct, already-tested intermediate result. +- **Heatmap `vmin` is `0` for all three methods**, including Spearman. + Spearman is mathematically capable of going negative, but real + sample-vs-sample correlations in this kind of data cluster tightly + positive (e.g. 0.7-1.0) — a `-1..1` scale (tried first) compressed all of + that meaningful variation into a sliver of the colour range, making the + heatmap look uniformly dark/uninformative. `0..1` keeps the full colour + range usable for the variation that actually occurs. +- Dropped the dead `iondict = cached_read_csv(...)` read that was never + actually used by the old `plot()` body. + ## Conventions - Don't edit the generated UI files (above). Put behaviour in `main.py` / diff --git a/docs/changelog.md b/docs/changelog.md index b58901b..82b796d 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -9,8 +9,39 @@ Metaboscape) and automatic GNPS2-compatible re-indexing of MSP/MGF fragment databases on export. - MVC refactor of "Plot Feature Sets" (`groupsets.py`). -- `fastcluster` optional acceleration for hierarchical clustering/bootstrap - dendrogram. +- Multivariate ordination rework: the former NMDS-only tab now supports + **PCA**, **NMDS**, and **PLS-DA** via a method switcher bar, plus a + scores/loadings view toggle and a loadings highlight synced to feature + selection elsewhere in the app. PCA/PLS-DA features are autoscaled + (mean-center + unit variance) before fitting; NMDS stays on raw + abundances (conventional for Bray-Curtis). A new Qt-free backend + (`ordination.py`) handles all ordination math and is covered by + headless unit tests. +- Dendrogram rework: + - Per-plot switcher bar for **View** (Technical vs Biological + Replicates), **Color** (Purity / None), **Bootstrap**, and + **Use Sample/Group Names**. + - Purity coloring: green branches are cleanly within-sample/group; + magenta branches mark the exact merge point where two groups' leaves + overlap (proven non-monophyly) — magenta rather than the more + conventional red, since red-green colorblindness (the most common + form) can't distinguish red from green. A title reports how many + samples/groups are fully correctly clustered. + - "Use Sample/Group Names" checkbox: replaces raw injection/file names + with `_b<#>_s<#>` (Technical Replicates view) or + `_b<#>` (Biological Replicates view). + - Bootstrap can be toggled off for a faster undecorated dendrogram. + - Fixed AU/BP annotation alignment: labels now stay a constant pixel + gap apart regardless of leaf count, with leaf-count-scaled font size. + - `fastcluster` optional acceleration for bootstrap linkage. +- Sample Correlation Matrix rework: a **Method** switcher (Spearman / + Jaccard / Bray-Curtis), a **View** switcher (Individual Injections / + Biological Replicates / Biological Groups), and a "Use Sample/Group + Names" checkbox, all in the nav bar shared with the UpSet Plot tab + (greyed out while that tab is active). The heatmap scale is fixed to + 0-1 for all three methods. +- UpSet and treemap plots are now rendered directly on a Qt canvas + (replacing a PNG round-trip). - Headless unit test suite (`code/tests/`). - Hardened dependency installer to prevent NumPy 2.x environment breaks. - Numerous bugfixes: Spearman double-colorbar on re-run, highlight not diff --git a/docs/development.md b/docs/development.md deleted file mode 100644 index 246d38e..0000000 --- a/docs/development.md +++ /dev/null @@ -1,119 +0,0 @@ -# Development - -This page is for people contributing to MPACT itself, not end users. For -the authoritative, most up-to-date version of these notes (kept alongside -the code), see [`devnotes.md`](https://github.com/BalunasLab/mpact/blob/main/devnotes.md) -in the repo root. - -## Architecture - -- **Generated — do not edit:** `ui_main.py`, `ui_main1.py`, - `ui_featureinfo.py`, `ui_plotparam.py`, `files.py`, `files_rc.py`. These - are Qt Designer output and get overwritten on regeneration. (Despite the - name, `ui_functions.py` is hand-written and fully editable — it's the - `UIFunctions` controller class.) -- **Hand-written app code:** `main.py` (`MainWindow`, run/save/load, - database search), `plotting.py` (plot classes), `filter.py`, `stats.py`, - `MSFaST.py` (analysis driver), `pvclust.py` (bootstrap dendrogram), - `translators.py` (import/export framework), `mzmineimport.py` (format - conversion), `getfragdb.py`, `mspwriter.py`. -- **Canonical peak table** format (what `MSFaST` consumes internally; - Progenesis is the native/baseline format): CSV with 3 header rows, row 2 - = `Compound,m/z,Retention time (min),`, col0 = `RT_mz` id, - col1 = m/z, col2 = RT. - -## Threading model - -`run_MSFaST` is Qt-free and runs on a `QThread` worker (`AnalysisWorker` in -`main.py`), so the GUI stays responsive during the heavy compute. -`MainWindow.run_analysis` reads widgets on the main thread, starts the -worker, and `_finish_analysis` does all matplotlib/Qt plotting back on the -**main thread** (matplotlib is not thread-safe). Never create Qt/matplotlib -objects on the worker thread. - -## Importer/translator framework (`translators.py`) - -Qt-free and unit-tested: `detect_peaktable_format`, `parse_msp`/ -`parse_mgf` (→ `FragmentEntry`), `reindex_fragments` (matches fragments to -peak-table rows by compound ID first, then m/z+RT — Progenesis MSP stores -neutral mass, not adduct m/z), `filter_source_peaktable` (row-subsets the -source peak table to surviving features). `mzmineimport.format_check` -delegates detection to this module. - -## Groupsets MVC (`groupsets.py`) - -`GroupSet` (data) + `GroupSetModel` (collection, bounds-safe selection, -CRUD) + `build_query_dict()` replace what used to be a bare list + -selected-index pair. `MainWindow.groupsetmodel` is the live state; -`ui_functions.py`'s `addgroup`/`removegroup`/`updatesets`/`updategroups`/ -`writegroups`/`colour_picker1` are thin view-sync controllers over it. -`main.py`'s `query` class still exists, but **only** as the unpickle target -for old `.mpct` files — `GroupSet.from_legacy`/`GroupSetModel.from_legacy_list` -convert on load. - -## Testing - -Headless unit tests live in `code/tests/` (pure-logic only — no Qt): - -``` -python -m pytest code/tests -q -``` - -Covers `filter`, `stats`, `importdependencies`, `translators`, and -`groupsets`. Add tests here for any new Qt-free logic. GUI behaviour can't -be tested headlessly — verify it by running the app. - -## Conventions - -- Never edit the generated UI files listed above. -- Plot generation goes through `MainWindow.safe_generate`, so one failing - plot doesn't abort the rest. -- `.mpct` saves are atomic (temp file + `os.replace`), with per-component - guards (`write_save`). -- `loadsession` restores each saved parameter independently — a bad/missing - field can't cascade and abort restoration of the rest. Add new analysis - parameters to **both** `enumerate_inputs` (save) and `loadsession` - (restore). -- Plot objects (`self.ftplt`, `self.kmd`, `self.spec`, ...) are created the - first time they're needed and `.reset()` afterward, via - `MainWindow._create_or_reset()` / `_generate_plots()` — never gate - create-vs-reset on a whole-session flag like `self.analysisrun`, since an - optional output can newly turn on mid-session for a dataset that didn't - have it before, and the object would never get created. -- Use `MainWindow._refresh_highlight()` (not `highlight_feature()`) to - redraw the current selection without changing it (e.g. on a tab switch). - `highlight_feature(newfeature)` is for real selection events and toggles - the highlight off if the same feature is clicked twice — calling it with - the already-selected feature re-triggers that toggle, which is a bug, not - a refresh. -- Every matplotlib `pick_event` handler must call - `plotting._is_duplicate_pick(parent, event)` first and bail if it - returns `True`. Matplotlib fires one `pick_event` per artist that - registers a hit, not one per click — a feature plotted in more than one - groupset/colour layer otherwise fires the handler twice per click. -- `importdependencies.checkdep()` should stay silent when nothing needs - installing — it runs on every launch, including every Spyder "Run File" - (which re-executes `main.py`'s top level). Only report actual - installs/failures. - -## Building the docs site locally - -``` -pip install mkdocs mkdocs-material -mkdocs serve -``` - -Then open `http://127.0.0.1:8000`. See [Hosting](#hosting-this-site) below -for deployment. - -## Hosting this site - -This site is plain static HTML generated by MkDocs — there's no backend, -database, or server-side logic, so it needs essentially no infrastructure. -**GitHub Pages is the right fit here**: it's free, MkDocs has built-in -support for deploying to it (`mkdocs gh-deploy`), and a low-traffic docs -site for a research tool doesn't need a dedicated host. A `gh-pages` -deploy workflow is included in this repo (`.github/workflows/docs.yml`) — -once GitHub Pages is enabled for this repo (Settings → Pages → Source: -`gh-pages` branch), the site builds and publishes automatically on every -push to `main` that touches `docs/` or `mkdocs.yml`. diff --git a/docs/index.md b/docs/index.md index 7d36644..da22288 100644 --- a/docs/index.md +++ b/docs/index.md @@ -11,7 +11,9 @@ plots, heatmaps, and per-feature spectral/database-match lookup. This site covers installing and running MPACT, the file formats it expects, the analysis and filtering options, and what each plot/tab -shows. If you're contributing to MPACT itself, see [Development](development.md). +shows. If you're contributing to MPACT itself, see +[`devnotes.md`](https://github.com/robertsamples/mpact/blob/main/devnotes.md) +in the repo root. ## Where to start @@ -25,8 +27,11 @@ shows. If you're contributing to MPACT itself, see [Development](development.md) This documentation is adapted from the original MPACT user guide (2022) and updated to reflect the current codebase (mid-2026), including the import/export translator framework for MZmine/MS-DIAL/ - Metaboscape peak tables, the background-threaded analysis run, and the - groupset (Plot Feature Sets) editor. Some screenshots referenced in - the original guide have not been re-captured yet — see - [Development](development.md) if you'd like to contribute updated - images. + Metaboscape peak tables, the background-threaded analysis run, the + groupset (Plot Feature Sets) editor, the multivariate ordination + rework (PCA/NMDS/PLS-DA with scores and loadings views), and the + dendrogram rework (purity coloring, view/bootstrap/label switchers). + Some screenshots referenced in the original guide have not been + re-captured yet — see + [`devnotes.md`](https://github.com/robertsamples/mpact/blob/main/devnotes.md) + if you'd like to contribute updated images. diff --git a/docs/plots/group-analysis.md b/docs/plots/group-analysis.md index f41eeb9..46eef57 100644 --- a/docs/plots/group-analysis.md +++ b/docs/plots/group-analysis.md @@ -12,14 +12,51 @@ combination of groups (top bar chart + dot matrix). ![UpSet plot](../images/upset-plot.png) *MPACT UpSet plot showing the distribution of features across sample sets.* -## Spearman Correlation Matrix +## Sample Correlation Matrix -Pairwise Spearman correlation between every group, useful for evaluating -overall metabolomic similarity at a glance. Colour scheme is configurable -in the plot options dialog. +Pairwise similarity between samples/groups, useful for evaluating overall +metabolomic similarity at a glance. Colour scheme is configurable in the +plot options dialog. -![Spearman correlation matrix](../images/spearman-matrix.png) -*MPACT Spearman correlation matrix.* +A settings bar shared with the UpSet Plot tab (the same bar holding the +"Sets"/"Sample Correlations" buttons) controls how it's drawn, and redraws +immediately on any change. These controls are greyed out while the UpSet +Plot tab is active, since they don't apply there. + +**Method** — which similarity measure to compute: + +- **Spearman** (default): rank correlation of abundance profiles, robust + to the non-normal, heavy-tailed abundance distributions typical of + LC-MS data. Mathematically ranges -1 to 1, but the heatmap scale is fixed + to 0-1 since real sample correlations cluster tightly positive in + practice — a -1-to-1 scale would compress that variation into an + unreadable sliver of the colour range. +- **Jaccard**: presence/absence similarity — based only on which features + are detected in each sample/group, ignoring how much. Useful when + detection (not relative abundance) is what you care about. Ranges 0 to 1. +- **Bray-Curtis**: abundance-weighted similarity, the standard measure in + ecology/metabolomics (same convention as the Multivariate Analysis tab's + NMDS). Ranges 0 to 1. + +**View** — which rows/columns to correlate: + +- **Biological Replicates** (default): technical replicates are averaged + together first (one row/column per sample), so the matrix reflects + biological/treatment-group similarity without technical noise. +- **Individual Injections**: no averaging — every injection is its own + row/column. +- **Biological Groups**: both technical and biological replicates are + averaged together — one row/column per treatment group, for "see only + biological groups" at a glance. + +**Use Sample/Group Names** — same nomenclature as the dendrogram's: when +checked, labels switch from the raw injection/file names to +`_b<#>_s<#>` (Individual Injections view), `_b<#>` +(Biological Replicates view), or the bare group name (Biological Groups +view, nothing left to shorten). + +![Sample correlation matrix](../images/spearman-matrix.png) +*MPACT sample correlation matrix.* ## Dendrogram @@ -31,12 +68,53 @@ For datasets where biological/treatment differences should exceed technical noise, technical replicates should cluster together after filtering. -Bootstrap analysis (1000 iterations) can be enabled in the plot options -dialog to annotate the dendrogram with approximately-unbiased (AU) p-values -and bootstrap probabilities (BP). AU values above 95 are generally -considered statistically significant. Bootstrap computation uses -`fastcluster` if it's installed (falling back to SciPy's hierarchical -clustering otherwise) for substantially faster linkage on large datasets. +A settings bar above the plot controls how it's drawn — all four options +are local to this tab (not the general plot options dialog) and redraw +immediately when changed: + +**View** — which leaves to cluster: + +- **Technical Replicates** (default): every injection is its own leaf, + letting you see whether each sample's individual injections agree with + each other. +- **Biological Replicates**: technical replicates are averaged together + first (one leaf per sample), so the plot reflects clustering of + biological/treatment groups without technical noise. + +**Color** — how branches are colored: + +- **Purity** (default): a branch is colored **green** if every leaf beneath + it belongs to the same sample (Technical Replicates view) or the same + treatment group (Biological Replicates view) — i.e. it's correctly, + unambiguously clustered. A branch is colored **magenta** if it's the + specific point where two different samples/groups' leaves are proven to + overlap (some of that sample's/group's replicates are on each side of the + split) — a real sign of poor clustering, not just "still mixed from + somewhere lower in the tree." (Magenta rather than the more conventional + red, since red-green colorblindness — the most common form — can't tell + red and green apart; magenta stays distinguishable from green.) Every + other branch (a clean join of two unrelated, already-resolved regions) + stays black, even if it sits above a magenta branch elsewhere in the tree + — so a single tangled sample doesn't paint the whole tree magenta. The + plot title reports how many samples/groups are *fully* correctly + clustered (e.g. "7/9 samples' replicates clustered together"). +- **None**: a plain, uncolored dendrogram with no title — useful if you + just want the clustering shape without the QC overlay. + +**Bootstrap** — when checked (default on), runs bootstrap resampling +(1000 iterations) and annotates the dendrogram with approximately-unbiased +(AU) p-values and bootstrap probabilities (BP) at each branch point. AU +values above 95 are generally considered statistically significant. +Bootstrap computation uses `fastcluster` if it's installed (falling back to +SciPy's hierarchical clustering otherwise) for substantially faster linkage +on large datasets. Uncheck for a faster, unannotated dendrogram. + +**Use Sample/Group Names** — when checked, leaf labels switch from the raw +injection/file names (which can be long or uninformative) to +`_b<#>_s<#>` (Technical Replicates view) or `_b<#>` +(Biological Replicates view), where `b<#>` numbers each biological +replicate (sample) within its group and `s<#>` numbers each technical +replicate (injection) within its sample. ![Dendrogram](../images/dendrogram.png) *MPACT dendrogram after filtering, showing correct clustering of most diff --git a/docs/plots/multivariate.md b/docs/plots/multivariate.md index 30aa180..013e37f 100644 --- a/docs/plots/multivariate.md +++ b/docs/plots/multivariate.md @@ -1,17 +1,46 @@ -# Multivariate Analysis (NMDS) +# Multivariate Analysis -Nonmetric multidimensional scaling (NMDS) reduces the high-dimensional -metabolomics dataset to a low-dimensional view that's easy to interpret -visually. Samples are plotted by biological group, with 95% confidence -ellipses shown per group — samples that are closer together are more -similar in overall metabolome. +Reduces the high-dimensional metabolomics dataset to a low-dimensional +view. Three ordination methods are available: **NMDS**, **PCA**, and +**PLS-DA** — all operating on the same samples × features intensity +matrix. -By default, technical replicates are averaged so NMDS runs at the -sample level. To instead evaluate clustering of individual technical -replicates, uncheck replicate averaging in the plot options dialog -(the small square button in the plot's toolbar). +A settings bar above the plot controls how it's drawn and redraw +immediately on any change: + +**Method** — which ordination to run: + +- **NMDS** (default): nonmetric multidimensional scaling using + Bray-Curtis dissimilarity. A rank-based embedding — samples that are + closer together are more similar in overall metabolome. The plot + title shows the NMDS stress (the conventional fit-quality metric); + axis labels are NMDS1/NMDS2 (no percent-explained, since NMDS is + not a linear decomposition of the feature space). +- **PCA**: principal component analysis on mean-centered, + unit-variance-scaled features. Axis labels show percent of + total variance explained by each component. +- **PLS-DA**: partial least-squares discriminant analysis, supervised + by biological group. Axis labels show percent of explained variance + per component. Useful when NMDS/PCA show overlapping groups but + there's a genuine biological difference you want to maximize the + separation of. + +**View** — which aspect of the ordination to plot: + +- **Scores** (default): each sample (or injection) as a point, + coloured by biological group, with 95% confidence ellipses per + group. +- **Loadings**: the top 25 features that most drive the separation, + shown as arrows from the origin. Selecting a feature in another + plot highlights it here (yellow marker at its loadings position). + +**Collapse Replicates** — when checked (default on), technical +replicates are averaged together before running ordination, so each +point represents one biological sample. Uncheck to treat every +individual injection as its own point, which can be useful for +diagnosing injection-level outliers. ![NMDS plot](../images/nmds-plot.png) -*MPACT NMDS plot with technical-replicate averaging, showing differences -between samples and biological groups, with shaded ovals denoting 95% -confidence intervals.* +*MPACT multivariate ordination (NMDS scores view) with technical-replicate +averaging, showing differences between samples and biological groups, +with shaded ovals denoting 95% confidence intervals.* diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 8482fd4..d2cdef7 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -90,4 +90,6 @@ python -m pytest code/tests -q ``` GUI behaviour itself can't be tested headlessly and needs to be checked by -running the app — see [Development](development.md). +running the app — see +[`devnotes.md`](https://github.com/robertsamples/mpact/blob/main/devnotes.md) +in the repo root. diff --git a/docs/user-guide/analysis-settings.md b/docs/user-guide/analysis-settings.md index 1500af0..f7ba592 100644 --- a/docs/user-guide/analysis-settings.md +++ b/docs/user-guide/analysis-settings.md @@ -59,6 +59,6 @@ presence/absence in these groups, as indicated by Venn diagrams.* Internally, each Plot Feature Set is a `GroupSet` object managed by a small model/collection class (`GroupSetModel`) rather than a bare list + selected-index pair. This is purely an implementation detail (see - [Development](../development.md)) — old `.mpct` save files still load - correctly, with their saved feature sets converted into the current - representation automatically. + [`devnotes.md`](https://github.com/robertsamples/mpact/blob/main/devnotes.md)) + — old `.mpct` save files still load correctly, with their saved feature + sets converted into the current representation automatically. diff --git a/mkdocs.yml b/mkdocs.yml index 3900d26..5b76a22 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -59,5 +59,4 @@ nav: - Heatmap: plots/heatmap.md - Feature Info: feature-info.md - Troubleshooting: troubleshooting.md - - Development: development.md - Changelog: changelog.md