robertsamples · robertsamples · Jun 30, 2026 · Jun 29, 2026 · Jun 29, 2026 · Jun 29, 2026
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -25,7 +25,7 @@ jobs:
       # against a few system libraries the base image doesn't ship.
       - if: runner.os == 'Linux'
         run: sudo apt-get update && sudo apt-get install -y libgl1 libxkbcommon-x11-0 libxcb-cursor0
-      - run: pip install "numpy<2" pandas scipy tqdm pytest PyQt5
+      - run: pip install "numpy<2" pandas scipy scikit-learn tqdm pytest PyQt5
       - run: python -m pytest code/tests -v
 
   lint:

diff --git a/code/clusterpurity.py b/code/clusterpurity.py
@@ -0,0 +1,108 @@
+"""
+MPACT
+Copyright 2022, Robert M. Samples, Sara P. Puckett, and Marcy J. Balunas
+
+Qt-free dendrogram "purity" coloring: a branch is colored green if every
+leaf beneath it shares the same group label -- i.e. that group is a
+monophyletic clade, it clustered together before merging with anything
+else -- magenta if the merge instead splits a label across its two children,
+and neutral otherwise. Used by the dendrogram tab to make it visually obvious
+whether technical replicates of one Sample cluster tightly together, and
+separately whether biological replicates of one Biolgroup are well separated from other groups.
+
+Default colors are green/magenta rather than the more conventional
+green/red -- red-green colorblindness (the most common form) makes the two
+indistinguishable; magenta stays distinguishable from green under all
+common forms of color vision deficiency.
+
+This module is Qt-free and unit-tested (see ``tests/test_clusterpurity.py``).
+"""
+
+
+def purity_link_color_func(Z, leaf_labels, true_color='green', false_color='magenta', neutral_color='black'):
+    """Build a ``link_color_func`` for ``scipy.cluster.hierarchy.dendrogram``.
+
+    Three-way coloring, classified by comparing the two children's label
+    sets (not by simply asking "is the merge result impure", which would
+    paint every ancestor of a single mixing event false_color all the way to
+    the root):
+
+    - ``true_color`` ("monophyletic"): the two children's label sets are
+      identical and contain exactly one label -- every leaf under this link
+      shares one label.
+    - ``false_color`` ("polyphyletic"): the two children's label sets
+      *overlap* (share at least one label) without being identical-and-
+      singleton -- this is definitive proof that some label's leaves are
+      split apart by this exact merge (some of that label is on each side),
+      i.e. genuinely non-monophyletic, not just "still impure from before".
+    - ``neutral_color``: the two children's label sets are *disjoint* (no
+      label in common) -- this merge simply joins two regions that don't
+      contradict each other; it's a clean bridge even if one or both
+      children are themselves impure from a *different* label's tangle
+      further down. This is what keeps a single low-level tangle from
+      cascading false_color all the way up the tree: once a tangled label's
+      clade stops growing (nothing more of that label to fold in), every
+      merge above it only ever joins disjoint regions, so it reverts to
+      ``neutral_color``.
+
+    Args:
+        Z: linkage matrix (``scipy.cluster.hierarchy.linkage`` or
+            fastcluster's drop-in) built on observations in the same order
+            as ``leaf_labels``.
+        leaf_labels: sequence, length == number of observations clustered by
+            ``Z``, giving each leaf's group label (e.g. its Sample or
+            Biolgroup), in the same order as the data passed to ``linkage``.
+
+    Returns:
+        callable: ``link_color_func(k)`` as expected by ``dendrogram``'s
+        ``link_color_func`` argument.
+    """
+    n_leaves = len(leaf_labels)
+    leaf_label_sets = {i: {leaf_labels[i]} for i in range(n_leaves)}
+    colors = {}
+    for i, row in enumerate(Z):
+        a, b = int(row[0]), int(row[1])
+        node_id = n_leaves + i
+        set_a, set_b = leaf_label_sets[a], leaf_label_sets[b]
+        merged = set_a | set_b
+        leaf_label_sets[node_id] = merged
+        if len(merged) == 1:
+            colors[node_id] = true_color
+        elif set_a.isdisjoint(set_b):
+            colors[node_id] = neutral_color
+        else:
+            colors[node_id] = false_color
+    return lambda k: colors.get(k, neutral_color)
+
+
+def purity_summary(Z, leaf_labels):
+    """Count how many distinct group labels form one pure clade each.
+
+    A label is "pure" only if *every* leaf carrying that label ends up
+    together in one clade before that clade merges with any other leaf --
+    i.e. the group is exactly monophyletic in the dendrogram. (A node whose
+    descendants are a uniform-but-incomplete subset of a label -- e.g. 2 of
+    a Sample's 3 technical replicates -- does NOT count: the third
+    replicate clustering elsewhere means that Sample isn't really pure.)
+
+    Returns:
+        (n_pure, n_total): number of distinct labels that are fully pure
+        clades, out of the total number of distinct labels in
+        ``leaf_labels``.
+    """
+    n_leaves = len(leaf_labels)
+    leaf_index_sets = {i: frozenset((i,)) for i in range(n_leaves)}
+    target_sets = {
+        label: frozenset(i for i in range(n_leaves) if leaf_labels[i] == label)
+        for label in set(leaf_labels)
+    }
+    pure_labels = set()
+    for i, row in enumerate(Z):
+        a, b = int(row[0]), int(row[1])
+        node_id = n_leaves + i
+        merged = leaf_index_sets[a] | leaf_index_sets[b]
+        leaf_index_sets[node_id] = merged
+        for label, target in target_sets.items():
+            if merged == target:
+                pure_labels.add(label)
+    return len(pure_labels), len(target_sets)
diff --git a/code/groupsets.py b/code/groupsets.py
@@ -122,6 +122,34 @@ def remove(self, index=None):
             del self._items[index]
         self.select(self._selected)
 
+    def move(self, from_index, to_index):
+        """Reorder the groupset at ``from_index`` to ``to_index``.
+
+        Out-of-range indices are clamped to the valid range; the call is a
+        no-op when the model is empty or the clamped indices are equal. The
+        groupset that was selected before the move is still selected after
+        (by identity, not by index) -- a drag-and-drop reorder shouldn't
+        change which groupset is being edited.
+        """
+        if not self._items:
+            return
+        from_index = max(0, min(from_index, len(self._items) - 1))
+        to_index = max(0, min(to_index, len(self._items) - 1))
+        if from_index == to_index:
+            return
+        selected_item = self.selected
+        groupset = self._items.pop(from_index)
+        self._items.insert(to_index, groupset)
+        if selected_item is not None:
+            # Identity, not '==' -- GroupSet.__eq__ is value-based, and two
+            # distinct groupsets can compare equal (e.g. freshly added ones
+            # before either is edited), so list.index() could pick the wrong
+            # one.
+            for i, item in enumerate(self._items):
+                if item is selected_item:
+                    self._selected = i
+                    break
+
     def update(self, index, *, name=None, src=None, incl=None, excl=None, colour=None):
         """Overwrite the given fields of the groupset at ``index``."""
         groupset = self._items[index]

diff --git a/code/main.py b/code/main.py
@@ -13,11 +13,10 @@
 import time
 import string
 
-import platform
 from PyQt5 import QtCore, QtWidgets
-from PyQt5.QtWidgets import QMainWindow, QSizeGrip, QGraphicsDropShadowEffect, QFileDialog, QListWidgetItem, QColorDialog
-from PyQt5.QtCore import (QCoreApplication, QPropertyAnimation, QDate, QDateTime, QMetaObject, QObject, QPoint, QRect, QSize, QTime, QUrl, Qt, QEvent)
-from PyQt5.QtGui import QBrush, QColor, QIcon, QPalette, QPainter, QPixmap
+from PyQt5.QtWidgets import QMainWindow, QSizeGrip
+from PyQt5.QtCore import QObject, Qt
+from PyQt5.QtGui import QPixmap
 from pathlib import Path
 
 # Install/verify non-stock dependencies (epam.indigo, UpSetPlot, squarify)
@@ -35,14 +34,14 @@
 import files
 
 from MSFaST import run_MSFaST, analysis_parameters
-from groupsets import GroupSet, GroupSetModel, build_query_dict
+from groupsets import GroupSetModel, build_query_dict
 from plotslots import PlotSlotRegistry
 from paramfields import save_checkbox_fields
 from csvcache import cached_read_csv, invalidate as invalidate_csv_cache
 from biogroups import compute_biological_groups
 from dbsearch import search_npatlas
 from searchtree import SearchTreePanel
-from plotting import plot_abund, show_spectrum, show_featureplt, plot_heatmap, plot_mzrt, plot_samplecorr, kendrick, plot_volcano, plot_fc3d, plot_dendrogram, plot_PCA, prev_cv, gen_upsetplt, gen_treemap
+from plotting import plot_abund, show_spectrum, show_featureplt, plot_heatmap, plot_mzrt, plot_samplecorr, kendrick, plot_volcano, plot_fc3d, plot_dendrogram, plot_ordination, prev_cv, plot_upset, plot_treemap
 import getfragdb
 
 from indigo import Indigo
@@ -71,27 +70,31 @@
 -add bypass for plots based on checkmark. possibly use if check: ... else: button.hide() then pass
 
 - distribution of CVs on bottom of cvplt?
-- add pca option and allow visualization of key features on multivar plt?
 
 #TODO#
-- in source spectra viewer in tab plot
-- do overall data quality score, AUC
+- in source spectra viewer in spectrum details tab plot with preexisting in source fragment deconvolution algorithm
+- clean up import sections and general code for better maintainability and good syntax/standards
+    ~main.py's own import section done (dead PyQt5/stdlib/groupsets imports removed,
+    verified unused via pyflakes + grep, no behavior change); other files not yet swept
+- do overall data quality score, AUC on CV plot or something, may be present in a different form already
 - standardize method and class names
-- database management, options
-- fix up analysisinfo file output
-
-- mzmine msp file import
-- add other ordination options
+- add terminal output with current line to status bar instead of just static status messages, perhaps with expand button to show full terminal output
+- potentially consider other database options like HMDB etc
+- fix up analysisinfo file output with better and more useful log info
+- add other ordination options like pca, pls-da, etc etc
 - add custom keyword arguments for each plot to make calling them easier
-- add runcheck before searching when switching to search tab
-- Figure out way to have only active plot be updated and then to update others
-    when plot is switched
-- make it so groups can be reordered
+- make it so groups can be reordered in the groupsets widgets?
+    ~model-layer support done: GroupSetModel.move() (groupsets.py), tested in
+    test_groupsets.py. UI drag-drop wiring (listWidget_pltgrps InternalMove +
+    syncing its rowsMoved signal to model.move()) not done -- needs a live
+    GUI session to verify the selection-tracking interacts correctly with
+    updatesets()'s existing blockSignals dance, which isn't something to
+    guess at unverified
 - consider if indexing and feature highly functions in plot options have any easy wins for optimization or disk use. (prob not)
 - make goto buttons just one class and lambda an index for the stacked widgets
     when connecting!
 
-
+likely items that need more thought and planning
 - maybe have a comparison mode for many different strains with and without elicitor
 - specificity/sensitivity plot
 - other statistical models
@@ -238,7 +241,15 @@ def __init__(self):
 
         self.ui.setupUi(self)
         self.ui.label_credits.setText('v1.00.01 r26.06.29')
-
+
+        # "PCA" was a misnomer left over from when this checkbox/button only
+        # ran NMDS (see plotting.plot_ordination) -- the underlying
+        # checkBox_pca objectName/analysis_params.PCA attribute stay
+        # unchanged for .mpct save-file compatibility; only the visible text
+        # and tooltip change.
+        self.ui.checkBox_pca.setText('Multivariate')
+        self.ui.btn_pca.setToolTip('Multivariate Ordination (PCA/NMDS/PLS-DA)')
+
         #initialize other dialog windows
         self.dialog = dialog()
         self.ftrdialog = ftrdialog()
@@ -759,6 +770,13 @@ def _refresh_highlight(self):
             )
             self.canvas['kmd'].draw_idle()
 
+        # Update the multivariate plot's loadings-view highlight (a separate
+        # concept from its scores view, which highlights a clicked *sample*
+        # via parent.pickedsample, not a feature -- so this only applies
+        # when self.pca exists and is currently showing loadings).
+        if getattr(self, 'pca', None) is not None:
+            self.pca.highlight_loading(self.pickedfeature, self.highlightcol)
+
         # Update feature plot with the selected feature
         self.highlight['featureplt'].set_data(
             [iondict.loc[self.pickedfeature, 'Retention time (min)']],
@@ -1038,6 +1056,11 @@ def _generate_plots(self):
         dfs = self.filtereddfs
         grpsts = self.groupsets
 
+        self._create_or_reset('treemap', 'treemap',
+            lambda: plot_treemap(self, 'treemap', self.ui.frame_treemap, pltfile, '', ''),
+            lambda: self.treemap.reset(pltfile, '', ''))
+        stop_functime('treemap complete')
+
         if params.CVfil:
             self._create_or_reset('prevcv', 'CV plot',
                 lambda: prev_cv(self, 'cvplt', self.ui.frame_cvplt, 'none', 'none', 'none'),
@@ -1055,10 +1078,10 @@ def _generate_plots(self):
         stop_functime('dendrogram complete')
 
         if params.PCA:
-            self._create_or_reset('pca', 'PCA/NMDS plot',
-                lambda: plot_PCA(self, 'pca', self.ui.frame_pca, pltfile, '', ''),
+            self._create_or_reset('pca', 'multivariate ordination plot',
+                lambda: plot_ordination(self, 'pca', self.ui.frame_pca, pltfile, '', ''),
                 lambda: self.pca.reset(pltfile, '', ''))
-            stop_functime('nmds complete')
+            stop_functime('ordination complete')
 
         if params.FC3Dplt:
             self._create_or_reset('fc3d', '3D fold-change plot',
@@ -1101,6 +1124,11 @@ def _generate_plots(self):
             lambda: self.samplecorr.reset(iondictfile, dfs, grpsts))
         stop_functime('samplecorr complete')
 
+        self._create_or_reset('upset', 'upset plot',
+            lambda: plot_upset(self, 'upset', self.ui.frame_upset, iondictfile, '', ''),
+            lambda: self.upset.reset(iondictfile, '', ''))
+        stop_functime('upsetplt complete')
+
     def run_analysis(self):
         # Ignore re-clicks while an analysis is already running on the worker thread.
         if getattr(self, '_analysis_thread', None) is not None and self._analysis_thread.isRunning():
@@ -1153,12 +1181,6 @@ def _on_compute_finished(self):
             self.ui.btn_run.setEnabled(True)
 
     def _finish_analysis(self):
-        try:
-            gen_treemap(self)  # move back to end
-        except Exception:
-            print("not generating tremap due to an error")
-        stop_functime('treemap complete')
-
         # Used for point opacity based on abundance colouring
         iondict = cached_read_csv(self.analysis_paramsgui.outputdir / 'iondict.csv', sep=',', header=[0], index_col=None)
         self.analysis_paramsgui.maxval = iondict['logmax'].max()
@@ -1206,11 +1228,6 @@ def _finish_analysis(self):
             self.fillfttree()
             self.dbsearchdone = True
 
-        try:
-            gen_upsetplt(self)
-        except Exception:
-            print("not generating upset plot due to an error")
-        stop_functime('upsetplt complete')
         self.ui.label_status.setText('Analysis Complete')
         stop_functime('analysis complete')
         print('')