From 69c86be0b2aacd6719946470c91ef03cad2047ef Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 31 May 2026 23:02:56 +0000 Subject: [PATCH 01/10] Render FLASHDeconv and FLASHQuant via OpenMS-Insight components Stage B (FLASHDeconv) + Stage D (FLASHQuant) of the migration off the bespoke flash_viewer_grid onto the reusable OpenMS-Insight library. Parse layer (src/parse/deconv.py): add long-format producers (deconv_spectrum_long, anno_spectrum_long, combined_spectrum_long with is_signal, mass_table_long with index+mass_id) alongside the existing array-format frames (additive, legacy path untouched), so components can filter by column value instead of row index. FLASHDeconv: new FLASHDeconvViewerOI builds the scan/mass tables, deconv/ annotated/combined LinePlots, Scatter3D precursor-signal plot, 4 heatmaps, DensityPlot and SequenceView, wired through one StateManager per experiment panel (distinct session_key => side-by-side selection isolation) with full [experiment][row][col] layout parity. Routed behind settings flag use_openms_insight_viewer (default true); legacy render_grid retained. FLASHQuant: render the feature-group view via FeatureView (standalone), no data transformation required. requirements: add openms-insight>=0.1.11. 
--- content/FLASHDeconv/FLASHDeconvViewer.py | 57 +++- content/FLASHDeconv/FLASHDeconvViewerOI.py | 347 +++++++++++++++++++++ content/FLASHQuant/FLASHQuantViewer.py | 44 ++- requirements.txt | 3 + settings.json | 1 + src/parse/deconv.py | 198 ++++++++++++ tests/test_deconv_long_format.py | 161 ++++++++++ 7 files changed, 781 insertions(+), 30 deletions(-) create mode 100644 content/FLASHDeconv/FLASHDeconvViewerOI.py create mode 100644 tests/test_deconv_long_format.py diff --git a/content/FLASHDeconv/FLASHDeconvViewer.py b/content/FLASHDeconv/FLASHDeconvViewer.py index 4097e32d..36ff9c5e 100644 --- a/content/FLASHDeconv/FLASHDeconvViewer.py +++ b/content/FLASHDeconv/FLASHDeconvViewer.py @@ -4,7 +4,34 @@ from src.common.common import page_setup, save_params from src.workflow.FileManager import FileManager +# Legacy bespoke-grid render path (kept importable until OI integration is verified). from src.render.render import render_grid +# New OpenMS-Insight viewer (Stage B). Selected via the +# `use_openms_insight_viewer` settings flag (defaults True). +from content.FLASHDeconv.FLASHDeconvViewerOI import render_experiment_panel + + +def _use_oi_viewer(): + return st.session_state.get("settings", {}).get( + "use_openms_insight_viewer", True + ) + + +def render_panel(experiment_id, layout_info_per_exp, file_manager, identifier, + grid_key, panel_index): + """Render one experiment panel via the configured viewer. + + Routes to the new OpenMS-Insight viewer when enabled, else the legacy grid. 
+ """ + if _use_oi_viewer(): + render_experiment_panel( + experiment_id, layout_info_per_exp, file_manager, panel_index + ) + else: + render_grid( + experiment_id, layout_info_per_exp, file_manager, + 'flashdeconv', identifier, grid_key + ) DEFAULT_LAYOUT = [['ms1_deconv_heat_map'], ['scan_table', 'mass_table'], ['anno_spectrum', 'deconv_spectrum'], ['3D_SN_plot']] @@ -84,9 +111,9 @@ def get_sequence(): on_change=select_experiment ) if 'selected_experiment0' in st.session_state: - render_grid( - st.session_state.selected_experiment0, layout[0], file_manager, - 'flashdeconv', "selected_experiment0", 'flash_viewer_grid_0' + render_panel( + st.session_state.selected_experiment0, layout[0], file_manager, + "selected_experiment0", 'flash_viewer_grid_0', panel_index=0 ) with c2: st.selectbox( @@ -97,10 +124,10 @@ def get_sequence(): ) if f"selected_experiment1" in st.session_state: with st.spinner('Loading component...'): - render_grid( - st.session_state["selected_experiment1"], layout[1], - file_manager, 'flashdeconv', 'selected_experiment1', - 'flash_viewer_grid_1' + render_panel( + st.session_state["selected_experiment1"], layout[1], + file_manager, 'selected_experiment1', + 'flash_viewer_grid_1', panel_index=1 ) else: @@ -114,9 +141,9 @@ def get_sequence(): if 'selected_experiment0' in st.session_state: - render_grid( - st.session_state.selected_experiment0, layout[0], file_manager, - 'flashdeconv', 'selected_experiment0' + render_panel( + st.session_state.selected_experiment0, layout[0], file_manager, + 'selected_experiment0', 'flash_viewer_grid', panel_index=0 ) ### for multiple experiments on one view @@ -135,11 +162,11 @@ def get_sequence(): ) # if #experiment input files are less than #layouts, all the pre-selection will be the first experiment if f"selected_experiment{exp_index}" in st.session_state: - render_grid( - st.session_state["selected_experiment%d" % exp_index], - layout[exp_index], file_manager, 'flashdeconv', - "selected_experiment%d" % exp_index, - 
'flash_viewer_grid_%d' % exp_index + render_panel( + st.session_state["selected_experiment%d" % exp_index], + layout[exp_index], file_manager, + "selected_experiment%d" % exp_index, + 'flash_viewer_grid_%d' % exp_index, panel_index=exp_index ) save_params(params) diff --git a/content/FLASHDeconv/FLASHDeconvViewerOI.py b/content/FLASHDeconv/FLASHDeconvViewerOI.py new file mode 100644 index 00000000..5be82392 --- /dev/null +++ b/content/FLASHDeconv/FLASHDeconvViewerOI.py @@ -0,0 +1,347 @@ +"""FLASHDeconv viewer rendered entirely with OpenMS-Insight components (Stage B). + +This is the NEW viewer for the FLASHApp -> OpenMS-Insight visualization migration. +It renders the FLASHDeconv workflow using the reusable ``openms_insight`` component +library (``Table``, ``LinePlot``, ``Heatmap``, ``Scatter3D``, ``DensityPlot``, +``SequenceView``) instead of the bespoke ``flash_viewer_grid`` Vue grid in +``src/render/*``. + +Design goals (see ``/home/user/parity/STRATEGY.md`` §4/§5): + +* ONE shared ``StateManager`` per rendered experiment panel, keyed by a DISTINCT + ``session_key`` (``svc_state_deconv_{experiment_id}_{panel_index}``) so that selections never + leak between side-by-side experiment panels (HARD edge #6). +* Layout parity: the ``[experiment][row][col]`` nested grid is reproduced with + ``st.columns`` per row (<=3 cols), rows stacked; multi-experiment side-by-side + uses a top-level ``st.columns`` (<=5 panels). +* The component->frame->filters/interactivity wiring exactly mirrors the schema + from the long-format parse producers in ``src/parse/deconv.py``. + +The OLD render path (``src/render/render.py`` / ``flash_viewer_grid``) is left +intact and importable; the page chooses which path to use. + +NOTE ON CACHES: every OpenMS-Insight component persists a preprocessed cache under +``{cache_path}/{cache_id}/``. We derive a per-experiment cache directory inside the +workspace so the caches live next to the FLASHApp parquet cache and are stable +across reruns.
``cache_id`` is suffixed with the experiment id to keep experiments +isolated on disk as well as in session state. +""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any, Dict, List, Optional + +import polars as pl +import streamlit as st + +from openms_insight import ( + DensityPlot, + Heatmap, + LinePlot, + Scatter3D, + SequenceView, + StateManager, + Table, +) + +# Map the layout COMPONENT_NAMES (FLASHDeconvLayoutManager) to a builder. Every +# builder returns a *callable* OpenMS-Insight component already wired with the +# shared filters/interactivity identifiers. The identifiers below are the FLASHApp +# StateTracker keys (scanIndex / massIndex / heatmap zoom ids) so that state flows +# across components exactly like the legacy grid. + +SCAN_KEY = "scanIndex" +MASS_KEY = "massIndex" + + +def _component_cache_dir(file_manager, experiment_id: str) -> str: + """Directory under the workspace cache where OI component caches are written.""" + cache_root = Path(file_manager.cache_path, "oi_components", str(experiment_id)) + cache_root.mkdir(parents=True, exist_ok=True) + return str(cache_root) + + +def _data_path(file_manager, experiment_id: str, name_tag: str) -> Optional[str]: + """Resolve the on-disk parquet path for a stored frame, or None if absent.""" + if not file_manager.result_exists(experiment_id, name_tag): + return None + res = file_manager.get_results(experiment_id, [name_tag], partial=True) + path = res.get(name_tag) + return str(path) if path is not None else None + + +def _lazy(file_manager, experiment_id: str, name_tag: str) -> Optional[pl.LazyFrame]: + """Load a stored frame as a polars LazyFrame, or None if absent.""" + if not file_manager.result_exists(experiment_id, name_tag): + return None + return file_manager.get_results( + experiment_id, [name_tag], use_polars=True + )[name_tag] + + +# --------------------------------------------------------------------------- +# Per-component builders. 
Each returns an OpenMS-Insight component instance, or +# None when the underlying data frame is missing (component is silently skipped). +# --------------------------------------------------------------------------- + +def _build_heatmap( + file_manager, experiment_id: str, cache_dir: str, frame_tag: str, + zoom_id: str, title: str, +): + data = _lazy(file_manager, experiment_id, frame_tag) + if data is None: + return None + # Long heatmap frames carry columns: mass, rt, intensity, scan_idx, mass_idx. + # Axes per Heatmap.md: x = Retention Time (rt), y = Mass (mass). + return Heatmap( + cache_id=f"{frame_tag}_{experiment_id}", + data=data, + x_column="rt", + y_column="mass", + intensity_column="intensity", + zoom_identifier=zoom_id, + title=title, + x_label="Retention Time", + y_label="Mass", + cache_path=cache_dir, + ) + + +def _build_scan_table(file_manager, experiment_id: str, cache_dir: str): + data = _lazy(file_manager, experiment_id, "scan_table") + if data is None: + return None + # Scan table: clicking a row sets scanIndex to the row's `index`. + return Table( + cache_id=f"scan_table_{experiment_id}", + data=data, + interactivity={SCAN_KEY: "index"}, + index_field="index", + title="Scan Table", + cache_path=cache_dir, + ) + + +def _build_mass_table(file_manager, experiment_id: str, cache_dir: str): + data = _lazy(file_manager, experiment_id, "mass_table_long") + if data is None: + return None + # Mass table (long): filtered to the selected scan via `index`; clicking a row + # sets massIndex to the row's `mass_id`. 
+ return Table( + cache_id=f"mass_table_{experiment_id}", + data=data, + filters={SCAN_KEY: "index"}, + interactivity={MASS_KEY: "mass_id"}, + index_field="mass_id", + title="Mass Table", + cache_path=cache_dir, + ) + + +def _build_deconv_spectrum(file_manager, experiment_id: str, cache_dir: str): + data = _lazy(file_manager, experiment_id, "deconv_spectrum_long") + if data is None: + return None + # Deconvolved spectrum: filtered by scan; clicking a peak sets massIndex. + return LinePlot( + cache_id=f"deconv_spectrum_{experiment_id}", + data=data, + filters={SCAN_KEY: "index"}, + interactivity={MASS_KEY: "peak_id"}, + x_column="MonoMass", + y_column="SumIntensity", + title="Deconvolved Spectrum", + x_label="Monoisotopic Mass", + y_label="Intensity", + cache_path=cache_dir, + ) + + +def _build_anno_spectrum(file_manager, experiment_id: str, cache_dir: str): + data = _lazy(file_manager, experiment_id, "anno_spectrum_long") + if data is None: + return None + # Annotated/raw spectrum: filtered by scan; consumer only (no interactivity). + return LinePlot( + cache_id=f"anno_spectrum_{experiment_id}", + data=data, + filters={SCAN_KEY: "index"}, + x_column="MonoMass_Anno", + y_column="SumIntensity_Anno", + title="Annotated Spectrum", + x_label="m/z", + y_label="Intensity", + cache_path=cache_dir, + ) + + +def _build_combined_spectrum(file_manager, experiment_id: str, cache_dir: str): + primary = _lazy(file_manager, experiment_id, "combined_spectrum_long") + if primary is None: + return None + anno = _lazy(file_manager, experiment_id, "anno_spectrum_long") + # Augmented/combined: primary deconv series + signal-peak markers, with the + # annotated overlay supplied as the second series. The LinePlot Vue reads the + # x2/y2 columns as INDEPENDENT column arrays (their own length), NOT row-aligned + # with the primary series. 
Because the deconv peak axis and the anno peak axis + # have different per-scan lengths, we must VERTICALLY STACK the two long frames + # (diagonal concat) rather than relationally join them (a join would multiply + # rows cartesian-style). After the scanIndex value-filter on `index`, the + # primary columns are populated on the deconv rows and the anno columns on the + # anno rows; each column array is then the correct length for its series. + if anno is not None: + primary = pl.concat([primary, anno], how="diagonal") + x2, y2 = "MonoMass_Anno", "SumIntensity_Anno" + else: + x2 = y2 = None + return LinePlot( + cache_id=f"combined_spectrum_{experiment_id}", + data=primary, + filters={SCAN_KEY: "index"}, + interactivity={MASS_KEY: "peak_id"}, + x_column="MonoMass", + y_column="SumIntensity", + signal_peak_column="is_signal", + x2_column=x2, + y2_column=y2, + title="Augmented Deconvolved Spectrum", + x_label="Monoisotopic Mass", + y_label="Intensity", + cache_path=cache_dir, + ) + + +def _build_scatter3d(file_manager, experiment_id: str, cache_dir: str): + data = _lazy(file_manager, experiment_id, "threedim_SN_plot") + if data is None: + return None + # 3D S/N plot: scanIndex value-filters on `index`; massIndex handled internally + # as an array subscript (NOT a value filter). + return Scatter3D( + cache_id=f"threedim_SN_plot_{experiment_id}", + data=data, + scan_filter="index", + signal_column="SignalPeaks", + noisy_column="NoisyPeaks", + title="Precursor Signals", + cache_path=cache_dir, + ) + + +def _build_fdr_plot(file_manager, experiment_id: str, cache_dir: str): + # Precomputed {x,y} density frames stored by deconv.py. The TnT/Deconv literals + # (axis "QScore", series "Target/Decoy QScores") are the DensityPlot defaults. 
+ target = _lazy(file_manager, experiment_id, "density_target") + decoy = _lazy(file_manager, experiment_id, "density_decoy") + if target is None and decoy is None: + return None + return DensityPlot( + cache_id=f"fdr_plot_{experiment_id}", + density_target=target, + density_decoy=decoy, + title="Score Distribution", + cache_path=cache_dir, + ) + + +def _get_sequence(file_manager): + """Return the submitted (sequence, fix_C, fix_M) tuple, or None.""" + if not file_manager.result_exists("sequence", "sequence"): + return None + sequence = file_manager.get_results("sequence", "sequence")["sequence"] + return ( + sequence["input_sequence"], + sequence["fixed_mod_cysteine"], + sequence["fixed_mod_methionine"], + ) + + +def _build_sequence_view(file_manager, experiment_id: str, cache_dir: str): + seq = _get_sequence(file_manager) + if seq is None: + return None + sequence_string, _fix_c, _fix_m = seq + # Deconv peaks are neutral masses (deconvolved=True). Wire the deconv long + # spectrum as the peaks_data (renamed to the SequenceView schema: peak_id, + # mass, intensity), filtered by the selected scan. C/M fixed mods are computed + # from the sequence (compute_fixed_mods=True) for Deconv parity. + peaks = _lazy(file_manager, experiment_id, "deconv_spectrum_long") + if peaks is None: + return None + peaks = peaks.select( + pl.col("index"), + pl.col("peak_id"), + pl.col("MonoMass").alias("mass"), + pl.col("SumIntensity").alias("intensity"), + ) + return SequenceView( + cache_id=f"sequence_view_{experiment_id}", + sequence_data=sequence_string, + peaks_data=peaks, + filters={SCAN_KEY: "index"}, + interactivity={MASS_KEY: "peak_id"}, + deconvolved=True, + compute_fixed_mods=True, + title="Sequence View", + cache_path=cache_dir, + ) + + +# COMPONENT_NAMES (layout) -> builder. Mirrors FLASHDeconvLayoutManager COMPONENT_NAMES. 
+COMPONENT_BUILDERS = { + "ms1_raw_heatmap": lambda fm, eid, cd: _build_heatmap( + fm, eid, cd, "ms1_raw_heatmap", "heatmap_raw", "Raw MS1 Heatmap"), + "ms2_raw_heatmap": lambda fm, eid, cd: _build_heatmap( + fm, eid, cd, "ms2_raw_heatmap", "heatmap_raw2", "Raw MS2 Heatmap"), + "ms1_deconv_heat_map": lambda fm, eid, cd: _build_heatmap( + fm, eid, cd, "ms1_deconv_heatmap", "heatmap_deconv", "Deconvolved MS1 Heatmap"), + "ms2_deconv_heat_map": lambda fm, eid, cd: _build_heatmap( + fm, eid, cd, "ms2_deconv_heatmap", "heatmap_deconv2", "Deconvolved MS2 Heatmap"), + "scan_table": _build_scan_table, + "deconv_spectrum": _build_deconv_spectrum, + "anno_spectrum": _build_anno_spectrum, + "mass_table": _build_mass_table, + "3D_SN_plot": _build_scatter3d, + "fdr_plot": _build_fdr_plot, + "sequence_view": _build_sequence_view, + # internal_fragment_map: deferred (component disabled in the legacy path too). +} + + +def build_component(file_manager, experiment_id: str, cache_dir: str, comp_name: str): + """Instantiate the OpenMS-Insight component for a layout cell, or None.""" + builder = COMPONENT_BUILDERS.get(comp_name) + if builder is None: + return None + return builder(file_manager, experiment_id, cache_dir) + + +def render_experiment_panel( + experiment_id: str, + layout_info_per_exp: List[List[str]], + file_manager, + panel_index: int, +): + """Render one experiment's [row][col] grid with its OWN isolated StateManager. + + The StateManager uses a DISTINCT session_key per experiment so selections made + in this panel do not leak into other side-by-side panels. 
+ """ + session_key = f"svc_state_deconv_{experiment_id}_{panel_index}" + state_manager = StateManager(session_key=session_key) + cache_dir = _component_cache_dir(file_manager, experiment_id) + + for row_index, row in enumerate(layout_info_per_exp): + columns = st.columns(len(row)) + for col, (col_index, comp_name) in zip(columns, enumerate(row)): + with col: + component = build_component( + file_manager, experiment_id, cache_dir, comp_name + ) + if component is None: + st.warning(f"No data for '{comp_name}'.") + continue + key = f"deconv_oi_{panel_index}_{row_index}_{col_index}_{comp_name}" + component(key=key, state_manager=state_manager) diff --git a/content/FLASHQuant/FLASHQuantViewer.py b/content/FLASHQuant/FLASHQuantViewer.py index 05077e9f..0226ed34 100644 --- a/content/FLASHQuant/FLASHQuantViewer.py +++ b/content/FLASHQuant/FLASHQuantViewer.py @@ -4,17 +4,24 @@ from src.workflow.FileManager import FileManager from src.common.common import page_setup, save_params -# from src.render.components import flash_viewer_grid_component, FlashViewerComponent, FLASHQuant -from src.render.render import render_grid + +# NOTE (Stage D rewiring): FLASHQuant now renders through the reusable +# `openms_insight.FeatureView` component instead of the bespoke +# `flash_viewer_grid` / `FLASHQuantView` path. The old render path is left +# importable on purpose (do NOT delete) so it can be restored or compared. 
+# from src.render.components import flash_viewer_grid_component, FlashViewerComponent, FLASHQuant +# from src.render.render import render_grid +from openms_insight import FeatureView # page initialization params = page_setup() # Get available results +workspace = st.session_state["workspace"] file_manager = FileManager( - st.session_state["workspace"], - Path(st.session_state['workspace'], 'flashquant', 'cache') + workspace, + Path(workspace, 'flashquant', 'cache') ) results = file_manager.get_results_list( ['quant_dfs'] @@ -26,22 +33,29 @@ st.stop() # Map names to index -name_to_index = {n : i for i, n in enumerate(results)} +name_to_index = {n: i for i, n in enumerate(results)} -# for only single experiment on one view +# FLASHQuant is a single-experiment, single-component page (no cross-linking, +# no configurable grid). Pick one experiment and render one FeatureView for it. st.selectbox("choose experiment", results, key="selected_experiment0_quant") selected_exp0 = st.session_state.selected_experiment0_quant -render_grid( - st.session_state.selected_experiment0_quant, [['quant_visualization']], - file_manager, 'flashquant', 'selected_experiment0_quant' +# Load the parsed feature-group frame produced by src/parse/flashquant.py +# (`connectTraceWithResult`): the 12 scalar columns plus the per-feature-group +# array columns Charges / IsotopeIndices / CentroidMzs / RTs / MZs / Intensities, +# where RTs/MZs/Intensities elements are comma-joined point strings. FeatureView +# consumes this frame directly (no transformation needed). +quant_df = file_manager.get_results(selected_exp0, ['quant_dfs'])['quant_dfs'] + +# Cache id is per-experiment so switching experiments yields an independent, +# correctly-scoped cache and selection. The cache lives under the FLASHQuant +# workspace cache directory. 
+feature_view = FeatureView( + cache_id=f'flashquant_{selected_exp0}', + data=quant_df, + cache_path=str(Path(workspace, 'flashquant', 'cache', 'featureview')), ) - -# # Get data -# quant_df = file_manager.get_results(selected_exp0, 'quant_dfs')['quant_dfs'] - -# component = [[FlashViewerComponent(FLASHQuant())]] -# flash_viewer_grid_component(components=component, data={'quant_data': quant_df, 'dataset': selected_exp0}, component_key='flash_viewer_grid') +feature_view(key=f'flashquant_featureview_{selected_exp0}') save_params(params) diff --git a/requirements.txt b/requirements.txt index 8fcf0064..b693ef64 100644 --- a/requirements.txt +++ b/requirements.txt @@ -140,6 +140,9 @@ xlsxwriter scipy>=1.15 polars>=1.0.0 +# OpenMS-Insight visualization components (FLASHDeconv viewer, Stage B; FLASHQuant FeatureView, Stage D) +openms-insight>=0.1.11 + # Redis Queue dependencies (for online mode) redis>=5.0.0 rq>=1.16.0 diff --git a/settings.json b/settings.json index 52e8063d..2ffa04fb 100644 --- a/settings.json +++ b/settings.json @@ -20,6 +20,7 @@ }, "online_deployment": false, "enable_workspaces": true, + "use_openms_insight_viewer": true, "test": true, "workspaces_dir": "..", "local_data_dir": "", diff --git a/src/parse/deconv.py b/src/parse/deconv.py index addeb3b6..2153eb89 100644 --- a/src/parse/deconv.py +++ b/src/parse/deconv.py @@ -10,6 +10,174 @@ # pushdown reads only the matching group(s) instead of the whole file. SPECTRA_ROW_GROUP_SIZE = 64 +# Long-format (one-row-per-peak / one-row-per-mass) frames are consumed by the +# OpenMS-Insight components (LinePlot / Table / Scatter3D), which filter by COLUMN +# VALUE (filters={'scanIndex':'index'}) rather than by the old `iloc[scanIndex]` +# ROW-INDEX path in src/render/update.py.
These producers are ADDITIVE: the legacy +# array-per-scan frames (deconv_spectrum, anno_spectrum, combined_spectrum, +# mass_table) are still stored unchanged for the old render path; the long frames +# are stored under separate `*_long` tags so both can coexist during Stage B. +# +# Exploded rows are sorted/grouped by `index` so a value-filter `index == k` +# reads exactly the rows the old `iloc[k]` array slice produced (the legacy frames +# are built with with_row_index + sort('index'), so row position == index value). +# Long peak frames use a much larger row group (predicate pushdown is by +# value, and per-scan peak counts are modest) than the array frames. +LONG_ROW_GROUP_SIZE = 10_000 + + +def _explode_long_by_position(indexed_lf, id_col, value_exprs): + """Explode parallel per-scan list columns into one row per position. + + This reproduces, exactly, the legacy FLASHApp Vue per-column expansion + (``TabulatorMassTable.vue`` ``tableData`` and the lineplot/3D consumers): + each per-scan list column is laid out by POSITION ``0..L-1`` independently; + the number of rows for a scan is the MAXIMUM list length across the supplied + columns, and any column shorter than that maximum yields ``null`` for the + missing trailing positions (the JS ``undefined``). Columns are therefore + ALIGNED BY POSITION, never lock-stepped — important because in real + FLASHDeconv output ``mz_array``/``intensity_array`` (the full spectrum) can be + LONGER than the per-mass ``MinCharges``/``SignalPeaks`` axis, and the legacy + UI pads the short columns with blanks rather than truncating. + + Args: + indexed_lf: a polars LazyFrame carrying an integer ``index`` column. + id_col: name of the per-scan position id column to emit (``peak_id`` or + ``mass_id``) — the 0-based row position within the scan. + value_exprs: list of ``(out_name, list_expr)``.
``list_expr`` is a polars + expression evaluating to a per-scan list; its element at position + ``id_col`` becomes the scalar ``out_name`` (null when out of range). + + Returns: + LazyFrame with columns ``index``, ``id_col``, then each ``out_name``, + sorted by ``index`` then ``id_col``. Scans whose columns are all empty + contribute 0 rows (matching the old ``iloc[k]`` empty-array slice). + """ + out_names = [name for name, _ in value_exprs] + + # Per-scan max length across all contributing list columns → number of + # positions to emit. (max of list.len() over the columns.) + max_len = value_exprs[0][1].list.len() + for _, expr in value_exprs[1:]: + max_len = pl.max_horizontal(max_len, expr.list.len()) + + lf = ( + indexed_lf + .select( + [pl.col("index")] + + [expr.alias(name) for name, expr in value_exprs] + + [pl.int_ranges(0, max_len).alias(id_col)] + ) + .explode(id_col) + # Empty scans explode to a single null-id row; drop so they contribute 0 rows. + .filter(pl.col(id_col).is_not_null()) + # Gather each column's value at the row's position (null when the column is + # shorter than this position — the legacy `undefined` cell). + .with_columns( + [pl.col(name).list.get(pl.col(id_col), null_on_oob=True).alias(name) + for name in out_names] + ) + .sort(["index", id_col]) + ) + return lf.select(["index", id_col] + out_names) + + +def deconv_spectrum_long(pl_deconv_indexed): + """One row per deconvolved peak: index, peak_id, MonoMass, SumIntensity. + + Long-format replacement for the array-valued ``deconv_spectrum`` frame, + consumed by ``LinePlot(filters={'scanIndex':'index'}, x_column='MonoMass', + y_column='SumIntensity')``. + """ + return _explode_long_by_position( + pl_deconv_indexed, + "peak_id", + [("MonoMass", pl.col("mz_array")), + ("SumIntensity", pl.col("intensity_array"))], + ) + + +def anno_spectrum_long(pl_anno_indexed): + """One row per annotated/raw peak: index, peak_id, MonoMass_Anno, + SumIntensity_Anno. 
+ + Long-format replacement for the array-valued ``anno_spectrum`` frame, + consumed by ``LinePlot(filters={'scanIndex':'index'}, + x_column='MonoMass_Anno', y_column='SumIntensity_Anno')``. + """ + return _explode_long_by_position( + pl_anno_indexed, + "peak_id", + [("MonoMass_Anno", pl.col("mz_array")), + ("SumIntensity_Anno", pl.col("intensity_array"))], + ) + + +def combined_spectrum_long(pl_deconv_indexed): + """One row per deconvolved peak with a signal-membership flag. + + Columns: index, peak_id, MonoMass, SumIntensity, is_signal (bool). + + ``is_signal`` is True when the corresponding per-mass entry of the nested + ``SignalPeaks`` column is non-empty, i.e. the deconvolved mass at that + position has at least one matched signal peak (mirrors the per-mass alignment + the 3D plot uses: ``Plotly3Dplot.vue`` indexes ``SignalPeaks[massIndex]`` by + the same position). ``SignalPeaks`` is the per-mass axis and in real output + can be SHORTER than ``mz_array``; positions beyond its length therefore have + no signal entry and are flagged ``False`` (parity with the JS ``undefined`` + → no-signal). This is the long-format counterpart of the array-valued + ``combined_spectrum`` deconv side; the annotated overlay is provided + separately by ``anno_spectrum_long`` (the OpenMS-Insight LinePlot reads the + 2nd series from its own ``x2_column``/``y2_column`` frame). + """ + # Per-mass boolean list: True where that mass has >=1 signal peak. Aligned to + # the SignalPeaks (per-mass) axis; _explode_long_by_position gathers it by the + # same position id as MonoMass and yields null past its end, coerced to False. 
+ is_signal_list = pl.col("SignalPeaks").list.eval(pl.element().list.len() > 0) + lf = _explode_long_by_position( + pl_deconv_indexed, + "peak_id", + [("MonoMass", pl.col("mz_array")), + ("SumIntensity", pl.col("intensity_array")), + ("is_signal", is_signal_list)], + ) + return lf.with_columns(pl.col("is_signal").fill_null(False)) + + +def mass_table_long(pl_deconv_indexed): + """One row per mass: index, mass_id, plus scalar mass-table fields. + + Long-format replacement for the array-valued ``mass_table`` frame. Each row is + one deconvolved mass within a scan; ``MonoMass``/``SumIntensity`` and the + per-mass charge/isotope/score columns become scalars. + + Consumed by ``Table(interactivity={'massIndex':'mass_id'}, + filters={'scanIndex':'index'})``: clicking a row sets ``massIndex`` to the + row's ``mass_id``, and the table is filtered to the selected scan via + ``index``. ``mass_id`` is the 0-based position of the mass within its scan, + matching the array-subscript semantics the 3D plot uses for ``massIndex``. + + Columns are aligned BY POSITION (not lock-stepped): the legacy + ``TabulatorMassTable.vue`` builds one row per position up to the MAX array + length across the required columns, leaving blanks where a column is shorter. + In real FLASHDeconv output ``MonoMass``/``SumIntensity`` (the full spectrum + ``mz_array``/``intensity_array``) may be LONGER than the per-mass charge/ + isotope/score arrays; those trailing rows therefore carry the mass/intensity + with ``null`` charge/isotope/score cells, exactly as the old UI rendered them. 
+ """ + value_exprs = [ + ("MonoMass", pl.col("mz_array")), + ("SumIntensity", pl.col("intensity_array")), + ("MinCharges", pl.col("MinCharges")), + ("MaxCharges", pl.col("MaxCharges")), + ("MinIsotopes", pl.col("MinIsotopes")), + ("MaxIsotopes", pl.col("MaxIsotopes")), + ("CosineScore", pl.col("cos")), + ("SNR", pl.col("snr")), + ("QScore", pl.col("qscore")), + ] + return _explode_long_by_position(pl_deconv_indexed, "mass_id", value_exprs) + def parseDeconv( file_manager, dataset_id, out_deconv_mzML, anno_annotated_mzML, spec1_tsv=None, spec2_tsv=None, logger=None @@ -111,6 +279,13 @@ def parseDeconv( ) file_manager.store_data(dataset_id, 'anno_spectrum', anno_spectrum_lazy, row_group_size=SPECTRA_ROW_GROUP_SIZE) + # anno_spectrum_long - long-format (one row per peak) for OpenMS-Insight LinePlot + file_manager.store_data( + dataset_id, 'anno_spectrum_long', + anno_spectrum_long(pl_anno_indexed), + row_group_size=LONG_ROW_GROUP_SIZE, + ) + logger.log("40.0 %", level=2) # mass_table - using native polars LazyFrame operations @@ -132,6 +307,13 @@ def parseDeconv( ) file_manager.store_data(dataset_id, 'mass_table', mass_table_lazy, row_group_size=SPECTRA_ROW_GROUP_SIZE) + # mass_table_long - long-format (one row per mass) for OpenMS-Insight Table + file_manager.store_data( + dataset_id, 'mass_table_long', + mass_table_long(pl_deconv_indexed), + row_group_size=LONG_ROW_GROUP_SIZE, + ) + logger.log("50.0 %", level=2) # sequence_view - using native polars LazyFrame operations @@ -160,6 +342,13 @@ def parseDeconv( ) file_manager.store_data(dataset_id, 'deconv_spectrum', deconv_spectrum_lazy, row_group_size=SPECTRA_ROW_GROUP_SIZE) + # deconv_spectrum_long - long-format (one row per peak) for OpenMS-Insight LinePlot + file_manager.store_data( + dataset_id, 'deconv_spectrum_long', + deconv_spectrum_long(pl_deconv_indexed), + row_group_size=LONG_ROW_GROUP_SIZE, + ) + logger.log("70.0 %", level=2) # anno & deconv spectrum (combined_spectrum) - using native polars LazyFrame 
join @@ -184,6 +373,15 @@ def parseDeconv( ) file_manager.store_data(dataset_id, 'combined_spectrum', combined_spectrum_lazy, row_group_size=SPECTRA_ROW_GROUP_SIZE) + # combined_spectrum_long - long-format deconv peaks + is_signal flag for + # OpenMS-Insight LinePlot (primary series). The annotated overlay (2nd series) + # is the separate anno_spectrum_long frame wired via x2_column/y2_column. + file_manager.store_data( + dataset_id, 'combined_spectrum_long', + combined_spectrum_long(pl_deconv_indexed), + row_group_size=LONG_ROW_GROUP_SIZE, + ) + logger.log("80.0 %", level=2) # 3D_SN_plot - using native polars LazyFrame operations diff --git a/tests/test_deconv_long_format.py b/tests/test_deconv_long_format.py new file mode 100644 index 00000000..25b00355 --- /dev/null +++ b/tests/test_deconv_long_format.py @@ -0,0 +1,161 @@ +""" +Tests for the Stage B long-format producers in src/parse/deconv.py. + +FLASHApp's legacy render path filters per-scan spectra/masses by ROW INDEX +(``iloc[scanIndex]``) and stores arrays-per-scan. The OpenMS-Insight components +filter by COLUMN VALUE and expect LONG format (one row per peak/mass). These +tests validate the additive long-format producers: + + - row-count fidelity: exploded rows == legacy per-column max-length expansion + (the TabulatorMassTable.vue ``forEach``-per-column semantics); + - index-filter parity: ``filter(index == k)`` reproduces, position by position, + the legacy ``iloc[k]`` array contents (with shorter columns padded to None); + - ragged-scan handling: when ``mz_array`` (full spectrum) is longer than the + per-mass charge/score arrays, trailing rows carry mass/intensity with null + charge/score cells; ``is_signal`` is False past the SignalPeaks axis; + - id columns: ``peak_id`` / ``mass_id`` are 0-based positions within each scan, + and the deconv peak axis and mass-table mass axis are aligned. + +The producers are pure polars (no Streamlit / pyopenms), so they are unit +testable without booting the app. 
``pyopenms`` is stubbed at import time only +because src/parse/deconv.py imports src/parse/masstable.py at module load. +""" + +import os +import sys +import types + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +# Stub pyopenms so importing src.parse.deconv works without the native dep +# (the long-format producers do not use it). +if "pyopenms" not in sys.modules: + _m = types.ModuleType("pyopenms") + for _a in ("MSExperiment", "MzMLFile", "SpectrumLookup", "Constants"): + setattr(_m, _a, type(_a, (), {"PROTON_MASS_U": 1.0, "C13C12_MASSDIFF_U": 1.0})) + sys.modules["pyopenms"] = _m + +import polars as pl + +from src.parse.deconv import ( + anno_spectrum_long, + combined_spectrum_long, + deconv_spectrum_long, + mass_table_long, +) + + +def _deconv(): + # scan 0 is RAGGED: mz_array length 4 > per-mass arrays length 3. + # scan 1 is EMPTY. scans 2,3 have 2 and 1 masses. + return pl.DataFrame( + { + "mz_array": [[1000.1, 2000.2, 3000.3, 4000.4], [], [500.5, 600.6], [777.7]], + "intensity_array": [[10.0, 20.0, 30.0, 40.0], [], [5.0, 6.0], [7.0]], + "MinCharges": [[1, 2, 3], [], [1, 2], [4]], + "MaxCharges": [[5, 6, 7], [], [3, 4], [8]], + "MinIsotopes": [[0, 1, 2], [], [0, 1], [3]], + "MaxIsotopes": [[4, 5, 6], [], [2, 3], [7]], + "cos": [[0.9, 0.8, 0.7], [], [0.95, 0.85], [0.6]], + "snr": [[11.0, 12.0, 13.0], [], [14.0, 15.0], [16.0]], + "qscore": [[0.99, 0.98, 0.97], [], [0.96, 0.95], [0.94]], + "SignalPeaks": [ + [[[0.0, 1000.1, 10.0, 1.0]], [], [[2.0, 3000.3, 30.0, 3.0]]], + [], + [[[0.0, 500.5, 5.0, 1.0]], []], + [[]], + ], + } + ).with_row_index("index") + + +def _anno(): + return pl.DataFrame( + { + "mz_array": [[101.1, 102.2], [201.1], [], [401.1, 402.2, 403.3]], + "intensity_array": [[1.0, 2.0], [3.0], [], [4.0, 5.0, 6.0]], + } + ).with_row_index("index") + + +def _max_len_expansion(row, cols): + arrays = {c: list(row[c]) for c in cols} + n = max((len(a) for a in arrays.values()), default=0) + return [{c: (arrays[c][i] if 
i < len(arrays[c]) else None) for c in arrays} for i in range(n)] + + +def test_deconv_spectrum_long_schema_and_rowcount(): + df = deconv_spectrum_long(_deconv().lazy()).collect() + assert df.columns == ["index", "peak_id", "MonoMass", "SumIntensity"] + # 4 + 0 + 2 + 1 = 7 + assert df.height == 7 + + +def test_anno_spectrum_long_index_filter_parity(): + anno = _anno() + df = anno_spectrum_long(anno.lazy()).collect() + assert df.columns == ["index", "peak_id", "MonoMass_Anno", "SumIntensity_Anno"] + apd = anno.to_pandas() + for k in range(len(apd)): + sub = df.filter(pl.col("index") == k).sort("peak_id") + want_mass = list(apd.iloc[k]["mz_array"]) + assert sub["MonoMass_Anno"].to_list() == want_mass + assert sub["peak_id"].to_list() == list(range(len(want_mass))) + + +def test_mass_table_long_ragged_padding(): + deconv = _deconv() + df = mass_table_long(deconv.lazy()).collect() + expected_cols = [ + "index", "mass_id", "MonoMass", "SumIntensity", + "MinCharges", "MaxCharges", "MinIsotopes", "MaxIsotopes", + "CosineScore", "SNR", "QScore", + ] + assert df.columns == expected_cols + # Scan 0 has 4 mass/intensity positions but only 3 charge positions → + # row at mass_id 3 carries MonoMass=4000.4 with null MinCharges. + scan0 = df.filter(pl.col("index") == 0).sort("mass_id") + assert scan0.height == 4 + last = scan0.row(3, named=True) + assert last["MonoMass"] == 4000.4 + assert last["MinCharges"] is None + assert last["QScore"] is None + # First three rows have full charge/score data. + assert scan0.row(0, named=True)["MinCharges"] == 1 + + +def test_mass_table_long_rowcount_and_empty_scan(): + df = mass_table_long(_deconv().lazy()).collect() + # max-length per scan: 4 + 0 + 2 + 1 = 7 + assert df.height == 7 + # Empty scan contributes no rows. 
+ assert df.filter(pl.col("index") == 1).height == 0 + + +def test_combined_spectrum_long_is_signal(): + deconv = _deconv() + df = combined_spectrum_long(deconv.lazy()).collect() + assert df.columns == ["index", "peak_id", "MonoMass", "SumIntensity", "is_signal"] + dpd = deconv.to_pandas() + for r in df.iter_rows(named=True): + sp = dpd[dpd["index"] == r["index"]].iloc[0]["SignalPeaks"] + pid = r["peak_id"] + want = (pid < len(sp)) and (len(sp[pid]) > 0) + assert bool(r["is_signal"]) == want + # Ragged past-end position (scan 0, peak_id 3) must be is_signal False. + row3 = df.filter((pl.col("index") == 0) & (pl.col("peak_id") == 3)).row(0, named=True) + assert row3["is_signal"] is False + + +def test_peak_id_and_mass_id_share_mass_axis(): + deconv = _deconv() + ds = deconv_spectrum_long(deconv.lazy()).collect() + mt = mass_table_long(deconv.lazy()).collect() + join = ds.join( + mt.select(["index", "mass_id", "MonoMass"]), + left_on=["index", "peak_id"], + right_on=["index", "mass_id"], + how="inner", + suffix="_mt", + ) + assert (join["MonoMass"] == join["MonoMass_mt"]).all() From e0d1e8f9a56022eed9f23f7cf9d08c5585e91de9 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 31 May 2026 23:14:59 +0000 Subject: [PATCH 02/10] Render FLASHTnT via OpenMS-Insight components MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stage C of the migration off flash_viewer_grid. New FLASHTnTViewerOI mirrors the FLASHDeconv viewer: protein/tag Tables, SequenceView (coverage + fixed mods), the combined-spectrum LinePlot with tagger overlay (2nd series + signal-peak markers + sequence-tag highlight), TnT-mode DensityPlot, and heatmaps — wired through one StateManager per experiment panel (distinct session_key => side-by-side isolation) with full [experiment][row][col] layout parity. Routed behind the same use_openms_insight_viewer flag; legacy render_grid retained. 
Proteoform->scan resolution: add build_proteoform_scan_frame (additive, legacy build_proteoform_scan_map untouched) so proteoform selections resolve to deconv scans and components value-filter by proteoform_index, reproducing the old PyArrow pushdown. Tag selection resolves tag m/z values into tagMasses for the spectrum overlay. InternalFragmentMap remains disabled (as in the legacy path); enabling it needs the per-proteoform sequence_data store extended with internal- fragment arrays. --- content/FLASHTnT/FLASHTnTViewer.py | 57 +++- content/FLASHTnT/FLASHTnTViewerOI.py | 459 +++++++++++++++++++++++++++ src/render/scan_resolution.py | 50 +++ 3 files changed, 560 insertions(+), 6 deletions(-) create mode 100644 content/FLASHTnT/FLASHTnTViewerOI.py diff --git a/content/FLASHTnT/FLASHTnTViewer.py b/content/FLASHTnT/FLASHTnTViewer.py index e94392f3..327a72d8 100644 --- a/content/FLASHTnT/FLASHTnTViewer.py +++ b/content/FLASHTnT/FLASHTnTViewer.py @@ -4,12 +4,39 @@ from src.common.common import page_setup, save_params from src.workflow.FileManager import FileManager +# Legacy bespoke-grid render path (kept importable until OI integration is verified). from src.render.render import render_grid +# New OpenMS-Insight viewer (Stage C). Selected via the +# `use_openms_insight_viewer` settings flag (defaults True). +from content.FLASHTnT.FLASHTnTViewerOI import render_experiment_panel + + +def _use_oi_viewer(): + return st.session_state.get("settings", {}).get( + "use_openms_insight_viewer", True + ) + + +def render_panel(experiment_id, layout_info_per_exp, file_manager, identifier, + grid_key, panel_index): + """Render one experiment panel via the configured viewer. + + Routes to the new OpenMS-Insight viewer when enabled, else the legacy grid. 
+ """ + if _use_oi_viewer(): + render_experiment_panel( + experiment_id, layout_info_per_exp, file_manager, panel_index + ) + else: + render_grid( + experiment_id, layout_info_per_exp, file_manager, + 'flashtnt', identifier, grid_key + ) DEFAULT_LAYOUT = [ - ['protein_table'], - ['sequence_view'], + ['protein_table'], + ['sequence_view'], ['tag_table'], ['combined_spectrum'] ] @@ -81,7 +108,11 @@ def validate_selected_index(file_manager, selected_experiment): on_change=select_experiment ) if 'selected_experiment0_tagger' in st.session_state: - render_grid(st.session_state.selected_experiment0_tagger, layout[0], file_manager, 'flashtnt', 'selected_experiment0_tagger') + render_panel( + st.session_state.selected_experiment0_tagger, layout[0], + file_manager, 'selected_experiment0_tagger', + 'flash_viewer_grid_0', panel_index=0 + ) with c2: st.selectbox( "choose experiment", display_names, @@ -90,7 +121,12 @@ def validate_selected_index(file_manager, selected_experiment): on_change=select_experiment ) if f"selected_experiment1_tagger" in st.session_state: - render_grid(st.session_state.selected_experiment1_tagger, layout[1], file_manager, 'flashtnt', 'selected_experiment1_tagger', 'flash_viewer_grid_1') + with st.spinner('Loading component...'): + render_panel( + st.session_state.selected_experiment1_tagger, layout[1], + file_manager, 'selected_experiment1_tagger', + 'flash_viewer_grid_1', panel_index=1 + ) else: @@ -103,7 +139,11 @@ def validate_selected_index(file_manager, selected_experiment): ) if 'selected_experiment0_tagger' in st.session_state: - render_grid(st.session_state.selected_experiment0_tagger, layout[0], file_manager, 'flashtnt', 'selected_experiment0_tagger') + render_panel( + st.session_state.selected_experiment0_tagger, layout[0], + file_manager, 'selected_experiment0_tagger', + 'flash_viewer_grid', panel_index=0 + ) ### for multiple experiments on one view if len(layout) > 1: @@ -122,6 +162,11 @@ def validate_selected_index(file_manager, 
selected_experiment): # if #experiment input files are less than #layouts, all the pre-selection will be the first experiment if f"selected_experiment{exp_index}_tagger" in st.session_state: - render_grid(st.session_state["selected_experiment%d_tagger" % exp_index], layout[exp_index], file_manager, 'flashtnt', f"selected_experiment{exp_index}_tagger", 'flash_viewer_grid_%d' % exp_index) + render_panel( + st.session_state["selected_experiment%d_tagger" % exp_index], + layout[exp_index], file_manager, + f"selected_experiment{exp_index}_tagger", + 'flash_viewer_grid_%d' % exp_index, panel_index=exp_index + ) save_params(params) \ No newline at end of file diff --git a/content/FLASHTnT/FLASHTnTViewerOI.py b/content/FLASHTnT/FLASHTnTViewerOI.py new file mode 100644 index 00000000..d2be1e8a --- /dev/null +++ b/content/FLASHTnT/FLASHTnTViewerOI.py @@ -0,0 +1,459 @@ +"""FLASHTnT viewer rendered entirely with OpenMS-Insight components (Stage C). + +This is the NEW viewer for the FLASHApp -> OpenMS-Insight visualization migration, +mirroring ``content/FLASHDeconv/FLASHDeconvViewerOI.py`` (Stage B). It renders the +FLASHTnT (tagger / top-down identification) workflow using the reusable +``openms_insight`` component library (``Table``, ``LinePlot``, ``SequenceView``, +``DensityPlot``, ``Heatmap``) instead of the bespoke ``flash_viewer_grid`` Vue grid +in ``src/render/*``. + +Design goals (see ``/home/user/parity/STRATEGY.md`` §4/§5 and Stage C edges): + +* ONE shared ``StateManager`` per rendered experiment panel, keyed by a DISTINCT + ``session_key`` (``svc_state_tnt_{experiment_id}_{panel_index}``) so selections never + leak between side-by-side experiment panels (HARD edge #6). +* Layout parity: the ``[experiment][row][col]`` nested grid is reproduced with + ``st.columns`` per row (<=3 cols), rows stacked; multi-experiment side-by-side + uses a top-level ``st.columns``.
+* TnT-specific wiring (STRATEGY §2/§3): + - ``protein_dfs`` is row-per-proteoform with ``index``; the protein Table sets + ``proteinIndex`` on click. + - ``tag_dfs`` is row-per-tag with ``Scan``/``ProteinIndex``/``StartPos``/``EndPos``/``mzs``. + - The per-proteoform ``sequence_data`` store (``sequence_data_store.py``) carries + ``coverage``/``maxCoverage`` keyed by ``proteoform_index``. + - **Scan resolution (HARD edge #3):** a proteoform selection must resolve to the + correct deconv scan. ``build_proteoform_scan_frame`` (additive helper in + ``src/render/scan_resolution.py``, reproducing the legacy + ``build_proteoform_scan_map`` PyArrow pushdown) surfaces ``proteoform_index -> + (scan, deconv_index)`` as COLUMNS. We stamp a ``proteoform_index`` column onto + the combined-spectrum / sequence-peak frames by joining on the deconv ``index``, + so the OpenMS-Insight components value-filter + (``filters={'proteinIndex': 'proteoform_index'}``) exactly like Deconv filters by + scan. + - **Tagger overlay (HARD edge #1):** the Tag Table sets the scalar ``tagData`` to the + clicked tag's ``TagIndex``; the viewer resolves it to that tag's list of masses (published as ``tagMasses``) and the combined-spectrum ``LinePlot`` highlights peaks whose + ``MonoMass`` matches a selected tag mass within ``abs(Δ) < 1e-5``. + +NOTE: FLASHTnT runs BOTH ``parseDeconv`` and ``parseTnT`` on the same dataset +(``src/Workflow.py``), so the Deconv long-format frames (``combined_spectrum_long``, +``scan_table``, heatmaps) are present alongside the TnT frames (``protein_dfs``, +``tag_dfs``, ``sequence_data``, ``settings``, ``density_id_target``/``density_id_decoy``). + +The OLD render path (``src/render/render.py`` / ``flash_viewer_grid``) is left intact +and importable; the page chooses which path to use via ``use_openms_insight_viewer``.
+""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any, List, Optional + +import polars as pl +import streamlit as st + +from openms_insight import ( + DensityPlot, + Heatmap, + LinePlot, + SequenceView, + StateManager, + Table, +) + +from src.render.scan_resolution import build_proteoform_scan_frame + +# FLASHApp StateTracker keys reused as OpenMS-Insight identifiers so state flows +# across components exactly like the legacy grid. +PROTEIN_KEY = "proteinIndex" +# Tag selection: the Tag Table sets a SCALAR `tagData` to the clicked tag's +# `TagIndex` (a scalar — list-valued interactivity columns are not supported by +# the OpenMS-Insight Table, which calls `.item()` on the cell). The viewer then +# resolves that TagIndex to the tag's list of masses and publishes it under +# `TAG_MASSES_KEY`, which the combined-spectrum LinePlot consumes for the tagger +# overlay (`tag_filters={'tagMasses': 'MonoMass'}`). +TAG_KEY = "tagData" +TAG_MASSES_KEY = "tagMasses" +MASS_KEY = "massIndex" + + +def _component_cache_dir(file_manager, experiment_id: str) -> str: + """Directory under the workspace cache where OI component caches are written.""" + cache_root = Path(file_manager.cache_path, "oi_components_tnt", str(experiment_id)) + cache_root.mkdir(parents=True, exist_ok=True) + return str(cache_root) + + +def _lazy(file_manager, experiment_id: str, name_tag: str) -> Optional[pl.LazyFrame]: + """Load a stored frame as a polars LazyFrame, or None if absent.""" + if not file_manager.result_exists(experiment_id, name_tag): + return None + return file_manager.get_results( + experiment_id, [name_tag], use_polars=True + )[name_tag] + + +def _pandas(file_manager, experiment_id: str, name_tag: str): + """Load a stored frame as pandas (for the precomputed density frames), or None.""" + if not file_manager.result_exists(experiment_id, name_tag): + return None + return file_manager.get_results(experiment_id, [name_tag])[name_tag] + + +# 
--------------------------------------------------------------------------- +# Scan resolution: proteoform_index -> deconv index, exposed as a frame so the +# spectrum / sequence components can value-filter by proteoform. +# --------------------------------------------------------------------------- + +def _proteoform_scan_frame(file_manager, experiment_id: str) -> Optional[pl.DataFrame]: + """proteoform_index / scan / deconv_index frame for this experiment, or None. + + Reproduces the legacy ``build_proteoform_scan_map`` (PyArrow pushdown in + ``src/render/update.py``) by reading the already-stored ``protein_dfs`` and + ``scan_table`` frames. Not cached: both frames are re-read and collected on + every call (i.e. on every rerun, once per component builder that needs it). + """ + protein = _lazy(file_manager, experiment_id, "protein_dfs") + scan_table = _lazy(file_manager, experiment_id, "scan_table") + if protein is None or scan_table is None: + return None + protein_df = protein.select(["index", "Scan"]).collect() + scan_df = scan_table.select(["index", "Scan"]).collect() + return build_proteoform_scan_frame(protein_df, scan_df) + + +def _stamp_proteoform_index( + spectrum_lf: pl.LazyFrame, scan_frame: pl.DataFrame +) -> pl.LazyFrame: + """Join a deconv-``index``-keyed long spectrum frame with the proteoform/scan + frame so each peak row carries the ``proteoform_index`` whose scan it belongs + to. This converts the proteoform selection into a plain value filter on the + spectrum (``filters={'proteinIndex': 'proteoform_index'}``).
+ + A scan may map to multiple proteoforms; the inner join replicates the peak + rows per proteoform so each proteoform selection sees its scan's peaks (the + legacy path resolves proteoform->scan then pushes that single scan down, which + is equivalent for the selected proteoform).""" + map_lf = scan_frame.lazy().select( + pl.col("deconv_index").alias("index"), + pl.col("proteoform_index"), + ) + return spectrum_lf.join(map_lf, on="index", how="inner") + + +# --------------------------------------------------------------------------- +# Per-component builders. Each returns an OpenMS-Insight component instance, or +# None when the underlying data frame is missing (component is silently skipped). +# --------------------------------------------------------------------------- + +def _build_protein_table(file_manager, experiment_id: str, cache_dir: str): + data = _lazy(file_manager, experiment_id, "protein_dfs") + if data is None: + return None + # Protein table: clicking a row sets proteinIndex to the row's `index`. + return Table( + cache_id=f"protein_table_{experiment_id}", + data=data, + interactivity={PROTEIN_KEY: "index"}, + index_field="index", + title="Protein Table", + cache_path=cache_dir, + ) + + +def _build_tag_table(file_manager, experiment_id: str, cache_dir: str): + data = _lazy(file_manager, experiment_id, "tag_dfs") + if data is None: + return None + scan_frame = _proteoform_scan_frame(file_manager, experiment_id) + if scan_frame is None: + return None + # Tags are scan (spectrum) data. To filter by the SELECTED PROTEOFORM we need + # a proteoform_index column on each tag row; resolve via the proteoform's scan + # (Scan column on the tag) so a proteoform selection shows its scan's tags + # (parity with the legacy filter_data Tag-Table path stamping ProteinIndex). 
+ map_lf = scan_frame.lazy().select( + pl.col("scan").alias("Scan"), + pl.col("proteoform_index"), + ) + tag_lf = data.join(map_lf, on="Scan", how="inner") + # Clicking a tag row sets the SCALAR `tagData` to the row's `TagIndex`. The + # viewer resolves that index to the tag's masses (see _resolve_tag_masses) and + # publishes them for the combined-spectrum tagger overlay. A list-valued + # interactivity column cannot be used here because the OI Table calls + # `.item()` on the clicked cell. + return Table( + cache_id=f"tag_table_{experiment_id}", + data=tag_lf, + filters={PROTEIN_KEY: "proteoform_index"}, + interactivity={TAG_KEY: "TagIndex"}, + index_field="TagIndex", + title="Tag Table", + cache_path=cache_dir, + ) + + +def _tag_mass_lookup(file_manager, experiment_id: str) -> dict: + """Map ``TagIndex`` -> list[float] of the tag's masses (parsed from the + comma-joined ``mzs`` string with its trailing comma). Used to resolve a tag + selection into the mass list the combined-spectrum tagger overlay matches.""" + tags = _lazy(file_manager, experiment_id, "tag_dfs") + if tags is None: + return {} + df = ( + tags.select(["TagIndex", "mzs"]) + .with_columns( + pl.col("mzs") + .str.strip_chars(",") + .str.split(",") + .list.eval(pl.element().cast(pl.Float64, strict=False)) + .alias("tag_masses") + ) + .collect() + ) + return { + int(ti): [m for m in masses if m is not None] + for ti, masses in zip(df["TagIndex"], df["tag_masses"].to_list()) + } + + +def _resolve_tag_masses(file_manager, experiment_id: str, state_manager) -> None: + """Resolve the selected ``tagData`` (a ``TagIndex``) to its list of masses and + publish under ``tagMasses`` so the combined-spectrum LinePlot tagger overlay + can read it. 
Clears ``tagMasses`` when no tag is selected.""" + tag_index = state_manager.get_selection(TAG_KEY) + if tag_index is None: + state_manager.clear_selection(TAG_MASSES_KEY) + return + lookup = _tag_mass_lookup(file_manager, experiment_id) + masses = lookup.get(int(tag_index)) + if masses: + state_manager.set_selection(TAG_MASSES_KEY, list(masses)) + else: + state_manager.clear_selection(TAG_MASSES_KEY) + + +def _build_sequence_frame( + file_manager, experiment_id: str +) -> Optional[pl.LazyFrame]: + """Build the SequenceView-ready per-proteoform sequence frame. + + Source: the per-proteoform ``sequence_data`` store (keyed by + ``proteoform_index``, carrying the per-residue ``coverage`` / ``maxCoverage`` + of the DISPLAYED proteoform substring and the full-protein ``sequence`` list + + ``proteoform_start``/``proteoform_end``). We reconstruct the displayed + proteoform sequence STRING (the substring the legacy SequenceView rendered) + and attach coverage so OpenMS-Insight SequenceView can shade residues. 
+ + Columns emitted: ``proteoform_index`` (filter key), ``sequence`` (str), + ``precursor_charge`` (=1, neutral/deconvolved peaks), ``coverage`` (list[f64]), + ``maxCoverage`` (f64), ``fixed_modifications`` (list[str]).""" + store = _lazy(file_manager, experiment_id, "sequence_data") + if store is None: + return None + schema = store.collect_schema().names() + want = ["proteoform_index", "sequence", "coverage", "maxCoverage", + "proteoform_start", "proteoform_end", "fixed_modifications"] + cols = [c for c in want if c in schema] + df = store.select(cols).collect() + + sequences: List[str] = [] + for row in df.iter_rows(named=True): + full = row.get("sequence") or [] + start = row.get("proteoform_start") + end = row.get("proteoform_end") + if start is None or end is None or start < 0 or end < 0: + sub = full + else: + sub = full[start:end + 1] + sequences.append("".join(sub)) + + out = pl.DataFrame({ + "proteoform_index": df["proteoform_index"], + "sequence": sequences, + "precursor_charge": [1] * df.height, + }) + if "coverage" in df.columns: + out = out.with_columns(df["coverage"].alias("coverage")) + if "maxCoverage" in df.columns: + out = out.with_columns(df["maxCoverage"].alias("maxCoverage")) + if "fixed_modifications" in df.columns: + out = out.with_columns(df["fixed_modifications"].alias("fixed_modifications")) + return out.lazy() + + +def _build_sequence_view(file_manager, experiment_id: str, cache_dir: str): + seq_frame = _build_sequence_frame(file_manager, experiment_id) + if seq_frame is None: + return None + scan_frame = _proteoform_scan_frame(file_manager, experiment_id) + combined = _lazy(file_manager, experiment_id, "combined_spectrum_long") + if combined is None: + combined = _lazy(file_manager, experiment_id, "deconv_spectrum_long") + peaks = None + if combined is not None and scan_frame is not None: + # Deconv peaks are neutral masses; filter by the proteoform's scan and + # rename to the SequenceView peaks schema (peak_id, mass, intensity). 
+ peaks = ( + _stamp_proteoform_index(combined, scan_frame) + .select( + pl.col("proteoform_index"), + pl.col("peak_id"), + pl.col("MonoMass").alias("mass"), + pl.col("SumIntensity").alias("intensity"), + ) + ) + + settings = _pandas(file_manager, experiment_id, "settings") + settings = dict(settings) if isinstance(settings, dict) else None + + return SequenceView( + cache_id=f"sequence_view_{experiment_id}", + sequence_data=seq_frame, + peaks_data=peaks, + filters={PROTEIN_KEY: "proteoform_index"}, + interactivity={MASS_KEY: "peak_id"}, + deconvolved=True, + compute_fixed_mods=True, + settings=settings, + title="Sequence View", + cache_path=cache_dir, + ) + + +def _build_combined_spectrum(file_manager, experiment_id: str, cache_dir: str): + primary = _lazy(file_manager, experiment_id, "combined_spectrum_long") + if primary is None: + return None + scan_frame = _proteoform_scan_frame(file_manager, experiment_id) + if scan_frame is None: + return None + primary = _stamp_proteoform_index(primary, scan_frame) + + # Annotated overlay (2nd series), stamped + filtered by the same proteoform. + anno = _lazy(file_manager, experiment_id, "anno_spectrum_long") + if anno is not None: + anno = _stamp_proteoform_index(anno, scan_frame) + primary = pl.concat([primary, anno], how="diagonal") + x2, y2 = "MonoMass_Anno", "SumIntensity_Anno" + else: + x2 = y2 = None + + # Combined spectrum: filtered by proteoform (resolved to scan), clicking a + # peak sets massIndex, signal-peak markers via is_signal, and the TAGGER + # OVERLAY highlights peaks whose MonoMass matches a selected tag mass + # (abs(Δ) < 1e-5, FLASHApp PlotlyLineplotTagger parity). The selected tag's + # masses arrive via the `tagMasses` state value (a list), which the viewer resolves from the scalar `tagData` TagIndex set by the Tag Table.
+ return LinePlot( + cache_id=f"combined_spectrum_{experiment_id}", + data=primary, + filters={PROTEIN_KEY: "proteoform_index"}, + interactivity={MASS_KEY: "peak_id"}, + x_column="MonoMass", + y_column="SumIntensity", + signal_peak_column="is_signal", + x2_column=x2, + y2_column=y2, + tag_filters={TAG_MASSES_KEY: "MonoMass"}, + tag_mass_column="MonoMass", + tag_tolerance=1e-5, + title="Augmented Deconvolved Spectrum", + x_label="Monoisotopic Mass", + y_label="Intensity", + cache_path=cache_dir, + ) + + +def _build_id_fdr_plot(file_manager, experiment_id: str, cache_dir: str): + # Precomputed TnT id-FDR density frames (computed in parseTnT with the TnT + # grouping: DECOY_ accession + ProteoformLevelQvalue>0). Literal labels stay + # "QScore"/"Target QScores"/"Decoy QScores" (DensityPlot defaults). + target = _lazy(file_manager, experiment_id, "density_id_target") + decoy = _lazy(file_manager, experiment_id, "density_id_decoy") + if target is None and decoy is None: + return None + return DensityPlot( + cache_id=f"id_fdr_plot_{experiment_id}", + density_target=target, + density_decoy=decoy, + title="Score Distribution", + cache_path=cache_dir, + ) + + +def _build_heatmap( + file_manager, experiment_id: str, cache_dir: str, frame_tag: str, + zoom_id: str, title: str, +): + data = _lazy(file_manager, experiment_id, frame_tag) + if data is None: + return None + return Heatmap( + cache_id=f"{frame_tag}_{experiment_id}", + data=data, + x_column="rt", + y_column="mass", + intensity_column="intensity", + zoom_identifier=zoom_id, + title=title, + x_label="Retention Time", + y_label="Mass", + cache_path=cache_dir, + ) + + +# COMPONENT_NAMES (FLASHTnTLayoutManager) -> builder. 
+COMPONENT_BUILDERS = { + "protein_table": _build_protein_table, + "sequence_view": _build_sequence_view, + "tag_table": _build_tag_table, + "combined_spectrum": _build_combined_spectrum, + "id_fdr_plot": _build_id_fdr_plot, + "ms1_raw_heatmap": lambda fm, eid, cd: _build_heatmap( + fm, eid, cd, "ms1_raw_heatmap", "heatmap_raw", "Raw MS1 Heatmap"), + "ms1_deconv_heat_map": lambda fm, eid, cd: _build_heatmap( + fm, eid, cd, "ms1_deconv_heatmap", "heatmap_deconv", "Deconvolved MS1 Heatmap"), + # internal_fragment_map: deferred (disabled in the legacy path too; the + # sequence_data store would need internal-fragment arrays — see module note). +} + + +def build_component(file_manager, experiment_id: str, cache_dir: str, comp_name: str): + """Instantiate the OpenMS-Insight component for a layout cell, or None.""" + builder = COMPONENT_BUILDERS.get(comp_name) + if builder is None: + return None + return builder(file_manager, experiment_id, cache_dir) + + +def render_experiment_panel( + experiment_id: str, + layout_info_per_exp: List[List[str]], + file_manager, + panel_index: int, +): + """Render one experiment's [row][col] grid with its OWN isolated StateManager. + + The StateManager uses a DISTINCT session_key per experiment so selections made + in this panel do not leak into other side-by-side panels (HARD edge #6). + """ + session_key = f"svc_state_tnt_{experiment_id}_{panel_index}" + state_manager = StateManager(session_key=session_key) + cache_dir = _component_cache_dir(file_manager, experiment_id) + + # Resolve the selected tag (scalar TagIndex set by the Tag Table) into its + # list of masses BEFORE rendering so the combined-spectrum tagger overlay + # sees the up-to-date `tagMasses` selection this rerun. 
+ _resolve_tag_masses(file_manager, experiment_id, state_manager) + + for row_index, row in enumerate(layout_info_per_exp): + columns = st.columns(len(row)) + for col, (col_index, comp_name) in zip(columns, enumerate(row)): + with col: + component = build_component( + file_manager, experiment_id, cache_dir, comp_name + ) + if component is None: + st.warning(f"No data for '{comp_name}'.") + continue + key = f"tnt_oi_{panel_index}_{row_index}_{col_index}_{comp_name}" + component(key=key, state_manager=state_manager) diff --git a/src/render/scan_resolution.py b/src/render/scan_resolution.py index a27a1038..3e4c06c9 100644 --- a/src/render/scan_resolution.py +++ b/src/render/scan_resolution.py @@ -1,4 +1,5 @@ import pandas as pd +import polars as pl def build_proteoform_scan_map(protein_df, scan_table_df): @@ -25,3 +26,52 @@ def build_proteoform_scan_map(protein_df, scan_table_df): "deconv_index": int(scan_to_index.loc[scan]), } return result + + +def build_proteoform_scan_frame(protein_df, scan_table_df): + """Polars frame surfacing the proteoform->scan resolution as COLUMNS. + + ADDITIVE helper for the OpenMS-Insight TnT viewer (Stage C). It reproduces + ``build_proteoform_scan_map`` (the legacy PyArrow pushdown in + ``src/render/update.py``) as a value-filterable frame so OpenMS-Insight + components can value-filter (``filters={'proteinIndex': 'proteoform_index'}``) + instead of doing an ``iloc`` / per-scan pushdown by hand. + + Args: + protein_df: pandas/polars frame with columns ``index`` (proteoform index) + and ``Scan``. + scan_table_df: pandas/polars frame with columns ``index`` (deconv row + index) and ``Scan``. + + Returns: + Polars DataFrame with columns ``proteoform_index`` (int64), ``scan`` + (int64) and ``deconv_index`` (int64). Proteoforms whose ``Scan`` is null + or absent from ``scan_table_df`` are omitted (same as the map builder). 
+ """ + scan_map = build_proteoform_scan_map( + _to_pandas(protein_df), _to_pandas(scan_table_df) + ) + if not scan_map: + return pl.DataFrame( + schema={ + "proteoform_index": pl.Int64, + "scan": pl.Int64, + "deconv_index": pl.Int64, + } + ) + rows = [ + {"proteoform_index": int(pid), "scan": int(v["scan"]), + "deconv_index": int(v["deconv_index"])} + for pid, v in scan_map.items() + ] + return pl.DataFrame(rows).sort("proteoform_index") + + +def _to_pandas(df): + """Accept a polars or pandas frame; return pandas (the map builder uses + pandas indexing semantics).""" + if isinstance(df, pl.DataFrame): + return df.to_pandas() + if isinstance(df, pl.LazyFrame): + return df.collect().to_pandas() + return df From d50d24b4026882226f152cc95061962b2ab3616c Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 31 May 2026 23:35:58 +0000 Subject: [PATCH 03/10] Fix integration bugs in FLASHApp OpenMS-Insight viewers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit deconv.py (_explode_long_by_position): the long-format explode kept the full per-scan value lists on every exploded row and then gathered the scalar — O(rows x max_len) memory, which OOMs on real spectra (an 865k-row annotated frame with multi-thousand-length lists reached tens of GB). Pad each value list to the per-scan max length with nulls and zip-explode the id column and all value lists together, staying O(output rows) with identical output (parity tests unchanged). FLASHTnTViewerOI (_build_sequence_frame): sequence_data is a pickle-backed dict keyed by proteoform_index, not a tabular frame. Loading it as a LazyFrame raised AttributeError and left SequenceView blank for every TnT dataset. Load the raw dict and iterate it, slicing the displayed proteoform substring and its coverage together so they stay aligned. 
--- content/FLASHTnT/FLASHTnTViewerOI.py | 58 +++++++++++++++++----------- src/parse/deconv.py | 25 +++++++----- 2 files changed, 52 insertions(+), 31 deletions(-) diff --git a/content/FLASHTnT/FLASHTnTViewerOI.py b/content/FLASHTnT/FLASHTnTViewerOI.py index d2be1e8a..b9853bc4 100644 --- a/content/FLASHTnT/FLASHTnTViewerOI.py +++ b/content/FLASHTnT/FLASHTnTViewerOI.py @@ -247,37 +247,51 @@ def _build_sequence_frame( Columns emitted: ``proteoform_index`` (filter key), ``sequence`` (str), ``precursor_charge`` (=1, neutral/deconvolved peaks), ``coverage`` (list[f64]), ``maxCoverage`` (f64), ``fixed_modifications`` (list[str]).""" - store = _lazy(file_manager, experiment_id, "sequence_data") - if store is None: + # ``sequence_data`` is a pickle-backed store: a dict keyed by + # ``proteoform_index``, each value a dict with per-residue ``sequence`` / + # ``coverage`` lists (full protein), ``maxCoverage``, ``proteoform_start`` / + # ``proteoform_end`` and ``fixed_modifications``. It is NOT a tabular frame, + # so it is loaded as the raw object (``_pandas`` returns the unpickled dict) + # and iterated — loading it as a LazyFrame raises AttributeError and leaves + # SequenceView blank. 
+ store = _pandas(file_manager, experiment_id, "sequence_data") + if not isinstance(store, dict) or not store: return None - schema = store.collect_schema().names() - want = ["proteoform_index", "sequence", "coverage", "maxCoverage", - "proteoform_start", "proteoform_end", "fixed_modifications"] - cols = [c for c in want if c in schema] - df = store.select(cols).collect() + proteoform_indices: List[int] = [] sequences: List[str] = [] - for row in df.iter_rows(named=True): - full = row.get("sequence") or [] - start = row.get("proteoform_start") - end = row.get("proteoform_end") + coverages: List[list] = [] + max_coverages: List[float] = [] + fixed_mods: List[list] = [] + for pid in sorted(store.keys()): + entry = store[pid] or {} + full = list(entry.get("sequence") or []) + cov = list(entry.get("coverage") or []) + start = entry.get("proteoform_start") + end = entry.get("proteoform_end") + # Slice the displayed proteoform substring AND its coverage together so the + # two stay aligned (the legacy SequenceView rendered the substring). A + # negative/absent bound means render the full protein. 
if start is None or end is None or start < 0 or end < 0: - sub = full + sub_seq, sub_cov = full, cov else: - sub = full[start:end + 1] - sequences.append("".join(sub)) + sub_seq, sub_cov = full[start:end + 1], cov[start:end + 1] + proteoform_indices.append(int(pid)) + sequences.append("".join(sub_seq)) + coverages.append([float(c) for c in sub_cov]) + mc = entry.get("maxCoverage") + max_coverages.append(float(mc) if mc is not None else 0.0) + fm = entry.get("fixed_modifications") or [] + fixed_mods.append([str(m) for m in fm]) out = pl.DataFrame({ - "proteoform_index": df["proteoform_index"], + "proteoform_index": proteoform_indices, "sequence": sequences, - "precursor_charge": [1] * df.height, + "precursor_charge": [1] * len(proteoform_indices), + "coverage": coverages, + "maxCoverage": max_coverages, + "fixed_modifications": fixed_mods, }) - if "coverage" in df.columns: - out = out.with_columns(df["coverage"].alias("coverage")) - if "maxCoverage" in df.columns: - out = out.with_columns(df["maxCoverage"].alias("maxCoverage")) - if "fixed_modifications" in df.columns: - out = out.with_columns(df["fixed_modifications"].alias("fixed_modifications")) return out.lazy() diff --git a/src/parse/deconv.py b/src/parse/deconv.py index 2153eb89..922a25d8 100644 --- a/src/parse/deconv.py +++ b/src/parse/deconv.py @@ -61,22 +61,29 @@ def _explode_long_by_position(indexed_lf, id_col, value_exprs): for _, expr in value_exprs[1:]: max_len = pl.max_horizontal(max_len, expr.list.len()) + # Pad every value list to the per-scan ``max_len`` with nulls — gathering by + # the position range with ``null_on_oob`` reproduces the legacy blank-tail + # cell — then zip-explode the id column and all value lists together so each + # output row is exactly one position. + # + # This stays O(total output rows). The earlier approach exploded only the id + # column while every row still carried the full per-scan value lists, then + # gathered the scalar — i.e. 
O(rows × max_len), which materialises the lists + # `max_len` times and OOMs on real spectra (e.g. an 865k-row annotated frame + # with multi-thousand-length lists ≈ tens of GB). Exploding the lists in + # lock-step avoids the duplication entirely. + positions = pl.int_ranges(0, max_len) lf = ( indexed_lf .select( [pl.col("index")] - + [expr.alias(name) for name, expr in value_exprs] - + [pl.int_ranges(0, max_len).alias(id_col)] + + [expr.list.gather(positions, null_on_oob=True).alias(name) + for name, expr in value_exprs] + + [positions.alias(id_col)] ) - .explode(id_col) + .explode([id_col] + out_names) # Empty scans explode to a single null-id row; drop so they contribute 0 rows. .filter(pl.col(id_col).is_not_null()) - # Gather each column's value at the row's position (null when the column is - # shorter than this position — the legacy `undefined` cell). - .with_columns( - [pl.col(name).list.get(pl.col(id_col), null_on_oob=True).alias(name) - for name in out_names] - ) .sort(["index", id_col]) ) return lf.select(["index", id_col] + out_names) From 62c6b00648c6c5f47f271d2ecdb27a0bcc7f8976 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 1 Jun 2026 09:55:10 +0000 Subject: [PATCH 04/10] Build and install OpenMS-Insight in the Docker image The new FLASHApp viewers render via OpenMS-Insight, whose components are not yet in the published PyPI release. Add an oi-build node stage (x86 + arm) that clones the OpenMS-Insight branch, builds its Vue bundle, and stages it at openms_insight/js-component/dist; then install the package from source in the run-app stage so the bundle is baked into site-packages where the runtime bridge loads it. openms-insight is stripped from requirements.txt so the PyPI release is not pulled instead. Repo/branch are overridable via the OPENMS_INSIGHT_REPO/OPENMS_INSIGHT_BRANCH build args (default: the migration branch). CI builds the image unchanged via the build-arg defaults. 
--- Dockerfile | 42 +++++++++++++++++++++++++++++++++++++++--- Dockerfile.arm | 39 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 75 insertions(+), 6 deletions(-) diff --git a/Dockerfile b/Dockerfile index 0f72d953..35707167 100644 --- a/Dockerfile +++ b/Dockerfile @@ -104,9 +104,12 @@ RUN make -j4 pyopenms WORKDIR /openms-build/pyOpenMS RUN pip install dist/*.whl -# Install other dependencies (excluding pyopenms) -COPY requirements.txt ./requirements.txt -RUN grep -Ev '^pyopenms([=<>!~].*)?$' requirements.txt > requirements_cleaned.txt && mv requirements_cleaned.txt requirements.txt +# Install other dependencies (excluding pyopenms and openms-insight). +# openms-insight is installed from source in the run-app stage instead: the +# pinned PyPI release does not yet carry the new FLASHApp visualization +# components, so we build our branch (with its Vue bundle) below. +COPY requirements.txt ./requirements.txt +RUN grep -Ev '^(pyopenms|openms-insight)([=<>!~].*)?$' requirements.txt > requirements_cleaned.txt && mv requirements_cleaned.txt requirements.txt RUN pip install -r requirements.txt WORKDIR / @@ -144,6 +147,31 @@ WORKDIR /openms-streamlit-vue-component RUN npm install RUN npm run build +# Build the OpenMS-Insight Vue bundle and stage its source tree so the Python +# package can be installed (with the bundle baked in) in the run-app stage. +# Like js-build, kept after the slow OpenMS compile so component changes don't +# invalidate that cache. +FROM node:21 AS oi-build + +# OpenMS-Insight repo/branch providing the new visualization components. +# Defaults to the migration branch because those components are not yet in the +# published PyPI release. Override via --build-arg once it is merged/released. +ARG OPENMS_INSIGHT_REPO=https://github.com/t0mdavid-m/OpenMS-Insight.git +ARG OPENMS_INSIGHT_BRANCH=claude/flashapp-openms-visualizations-LVv66 + +# Bust the build cache whenever the branch head moves. 
+ADD https://api.github.com/repos/t0mdavid-m/OpenMS-Insight/git/refs/heads/$OPENMS_INSIGHT_BRANCH oi-version.json + +RUN git clone -b ${OPENMS_INSIGHT_BRANCH} --single-branch ${OPENMS_INSIGHT_REPO} /openms-insight +WORKDIR /openms-insight/js-component +RUN npm install +RUN npm run build +# The runtime bridge and the wheel packaging both expect the built bundle at +# openms_insight/js-component/dist; place it there so the `pip install` in the +# run-app stage bundles it into site-packages. +RUN mkdir -p /openms-insight/openms_insight/js-component \ + && cp -r /openms-insight/js-component/dist /openms-insight/openms_insight/js-component/dist + # Prepare and run streamlit app. FROM compile-openms AS run-app @@ -190,6 +218,14 @@ COPY presets.json /app/presets.json # Copy the pre-built Vue/JS component (built in the js-build stage above). COPY --from=js-build openms-streamlit-vue-component/dist /app/js-component/dist +# Install OpenMS-Insight (our branch build, with its freshly built Vue bundle) +# into the streamlit env. requirements.txt has openms-insight stripped, so this +# is the authoritative install and provides the new visualization components. +# The package ships its Vue bundle at openms_insight/js-component/dist (staged +# in the oi-build stage), which the runtime bridge loads in production mode. 
+COPY --from=oi-build /openms-insight /opt/openms-insight +RUN mamba run -n streamlit-env pip install /opt/openms-insight + # add cron job to the crontab RUN echo "0 3 * * * /root/miniforge3/envs/streamlit-env/bin/python /app/clean-up-workspaces.py >> /app/clean-up-workspaces.log 2>&1" | crontab - diff --git a/Dockerfile.arm b/Dockerfile.arm index 9fe055ec..a9e2f010 100644 --- a/Dockerfile.arm +++ b/Dockerfile.arm @@ -99,9 +99,12 @@ RUN make -j4 pyopenms WORKDIR /openms-build/pyOpenMS RUN pip install dist/*.whl -# Install other dependencies (excluding pyopenms) -COPY requirements.txt ./requirements.txt -RUN grep -Ev '^pyopenms([=<>!~].*)?$' requirements.txt > requirements_cleaned.txt && mv requirements_cleaned.txt requirements.txt +# Install other dependencies (excluding pyopenms and openms-insight). openms-insight +# is installed from source in the run-app stage: the pinned PyPI release does not +# yet carry the new FLASHApp visualization components, so we build our branch +# (with its Vue bundle) below. +COPY requirements.txt ./requirements.txt +RUN grep -Ev '^(pyopenms|openms-insight)([=<>!~].*)?$' requirements.txt > requirements_cleaned.txt && mv requirements_cleaned.txt requirements.txt RUN pip install -r requirements.txt WORKDIR / @@ -139,6 +142,28 @@ WORKDIR /openms-streamlit-vue-component RUN npm install RUN npm run build +# Build the OpenMS-Insight Vue bundle and stage its source tree so the Python +# package can be installed (with the bundle baked in) in the run-app stage. +FROM node:21 AS oi-build + +# OpenMS-Insight repo/branch providing the new visualization components. Defaults +# to the migration branch (its components are not yet in the PyPI release). +# Override via --build-arg once merged/released. +ARG OPENMS_INSIGHT_REPO=https://github.com/t0mdavid-m/OpenMS-Insight.git +ARG OPENMS_INSIGHT_BRANCH=claude/flashapp-openms-visualizations-LVv66 + +# Bust the build cache whenever the branch head moves. 
+ADD https://api.github.com/repos/t0mdavid-m/OpenMS-Insight/git/refs/heads/$OPENMS_INSIGHT_BRANCH oi-version.json + +RUN git clone -b ${OPENMS_INSIGHT_BRANCH} --single-branch ${OPENMS_INSIGHT_REPO} /openms-insight +WORKDIR /openms-insight/js-component +RUN npm install +RUN npm run build +# The runtime bridge and wheel packaging expect the bundle at +# openms_insight/js-component/dist; place it there so pip install bundles it. +RUN mkdir -p /openms-insight/openms_insight/js-component \ + && cp -r /openms-insight/js-component/dist /openms-insight/openms_insight/js-component/dist + # Prepare and run streamlit app. FROM compile-openms AS run-app @@ -171,6 +196,14 @@ COPY presets.json /app/presets.json # Copy the pre-built Vue/JS component (built in the js-build stage above). COPY --from=js-build openms-streamlit-vue-component/dist /app/js-component/dist +# Install OpenMS-Insight (our branch build, with its freshly built Vue bundle) +# into the streamlit env. requirements.txt has openms-insight stripped, so this +# is the authoritative install providing the new visualization components. The +# package ships its Vue bundle at openms_insight/js-component/dist (staged in the +# oi-build stage), which the runtime bridge loads in production mode. 
+COPY --from=oi-build /openms-insight /opt/openms-insight +RUN mamba run -n streamlit-env pip install /opt/openms-insight + # add cron job to the crontab RUN echo "0 3 * * * /root/miniforge3/envs/streamlit-env/bin/python /app/clean-up-workspaces.py >> /app/clean-up-workspaces.log 2>&1" | crontab - From da0db3f23387f5485932823084b85d7593dfca29 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 1 Jun 2026 10:12:10 +0000 Subject: [PATCH 05/10] Address PR review: robust viewer fallback, quieter skips, tag lookup From the CodeRabbit review on #91: - Lazy-import the OpenMS-Insight viewer inside render_panel (FLASHDeconv and FLASHTnT) with a try/except fallback to the legacy grid, so an import-time failure of the OI stack no longer breaks the whole page before the use_openms_insight_viewer fallback can engage. - FLASHDeconvViewerOI: skip absent optional components silently instead of emitting a warning on every rerun (a builder returns None for legitimately absent optional frames). - FLASHTnTViewerOI: resolve the selected tag's masses by filtering tag_dfs on the chosen TagIndex instead of collecting the whole table into a lookup dict on every rerun (preserves trailing-comma parsing and null-dropping). Not applied: the suggestion to move page_setup() above the new helper definitions - those are plain function defs that do not execute at import, and page_setup()'s call site is unchanged from the file's existing structure (which already defines DEFAULT_LAYOUT and other helpers before page_setup()). 
--- content/FLASHDeconv/FLASHDeconvViewer.py | 33 ++++++++++------ content/FLASHDeconv/FLASHDeconvViewerOI.py | 4 +- content/FLASHTnT/FLASHTnTViewer.py | 33 ++++++++++------ content/FLASHTnT/FLASHTnTViewerOI.py | 46 +++++++++++----------- 4 files changed, 69 insertions(+), 47 deletions(-) diff --git a/content/FLASHDeconv/FLASHDeconvViewer.py b/content/FLASHDeconv/FLASHDeconvViewer.py index 36ff9c5e..55b4276c 100644 --- a/content/FLASHDeconv/FLASHDeconvViewer.py +++ b/content/FLASHDeconv/FLASHDeconvViewer.py @@ -6,9 +6,9 @@ from src.workflow.FileManager import FileManager # Legacy bespoke-grid render path (kept importable until OI integration is verified). from src.render.render import render_grid -# New OpenMS-Insight viewer (Stage B). Selected via the -# `use_openms_insight_viewer` settings flag (defaults True). -from content.FLASHDeconv.FLASHDeconvViewerOI import render_experiment_panel +# The OpenMS-Insight viewer (Stage B) is imported lazily inside render_panel (see +# below) so an import failure (e.g. a missing openms-insight install) falls back +# to the legacy grid instead of breaking the whole page. def _use_oi_viewer(): @@ -22,16 +22,27 @@ def render_panel(experiment_id, layout_info_per_exp, file_manager, identifier, """Render one experiment panel via the configured viewer. Routes to the new OpenMS-Insight viewer when enabled, else the legacy grid. + The OI viewer is imported lazily and guarded so an import failure falls back + to the legacy grid rather than breaking the page. """ if _use_oi_viewer(): - render_experiment_panel( - experiment_id, layout_info_per_exp, file_manager, panel_index - ) - else: - render_grid( - experiment_id, layout_info_per_exp, file_manager, - 'flashdeconv', identifier, grid_key - ) + try: + from content.FLASHDeconv.FLASHDeconvViewerOI import ( + render_experiment_panel, + ) + except Exception as exc: # noqa: BLE001 - OI viewer unavailable + st.warning( + f"OpenMS-Insight viewer unavailable ({exc}); using legacy grid." 
+ ) + else: + render_experiment_panel( + experiment_id, layout_info_per_exp, file_manager, panel_index + ) + return + render_grid( + experiment_id, layout_info_per_exp, file_manager, + 'flashdeconv', identifier, grid_key + ) DEFAULT_LAYOUT = [['ms1_deconv_heat_map'], ['scan_table', 'mass_table'], ['anno_spectrum', 'deconv_spectrum'], ['3D_SN_plot']] diff --git a/content/FLASHDeconv/FLASHDeconvViewerOI.py b/content/FLASHDeconv/FLASHDeconvViewerOI.py index 5be82392..e27ab1cb 100644 --- a/content/FLASHDeconv/FLASHDeconvViewerOI.py +++ b/content/FLASHDeconv/FLASHDeconvViewerOI.py @@ -340,8 +340,10 @@ def render_experiment_panel( component = build_component( file_manager, experiment_id, cache_dir, comp_name ) + # A builder returns None when its optional backing frame is + # absent (e.g. no sequence submitted, or *_long not yet cached); + # skip silently rather than warning on every rerun. if component is None: - st.warning(f"No data for '{comp_name}'.") continue key = f"deconv_oi_{panel_index}_{row_index}_{col_index}_{comp_name}" component(key=key, state_manager=state_manager) diff --git a/content/FLASHTnT/FLASHTnTViewer.py b/content/FLASHTnT/FLASHTnTViewer.py index 327a72d8..0d3d7079 100644 --- a/content/FLASHTnT/FLASHTnTViewer.py +++ b/content/FLASHTnT/FLASHTnTViewer.py @@ -6,9 +6,9 @@ from src.workflow.FileManager import FileManager # Legacy bespoke-grid render path (kept importable until OI integration is verified). from src.render.render import render_grid -# New OpenMS-Insight viewer (Stage C). Selected via the -# `use_openms_insight_viewer` settings flag (defaults True). -from content.FLASHTnT.FLASHTnTViewerOI import render_experiment_panel +# The OpenMS-Insight viewer (Stage C) is imported lazily inside render_panel (see +# below) so an import failure (e.g. a missing openms-insight install) falls back +# to the legacy grid instead of breaking the whole page. 
def _use_oi_viewer(): @@ -22,16 +22,27 @@ def render_panel(experiment_id, layout_info_per_exp, file_manager, identifier, """Render one experiment panel via the configured viewer. Routes to the new OpenMS-Insight viewer when enabled, else the legacy grid. + The OI viewer is imported lazily and guarded so an import failure falls back + to the legacy grid rather than breaking the page. """ if _use_oi_viewer(): - render_experiment_panel( - experiment_id, layout_info_per_exp, file_manager, panel_index - ) - else: - render_grid( - experiment_id, layout_info_per_exp, file_manager, - 'flashtnt', identifier, grid_key - ) + try: + from content.FLASHTnT.FLASHTnTViewerOI import ( + render_experiment_panel, + ) + except Exception as exc: # noqa: BLE001 - OI viewer unavailable + st.warning( + f"OpenMS-Insight viewer unavailable ({exc}); using legacy grid." + ) + else: + render_experiment_panel( + experiment_id, layout_info_per_exp, file_manager, panel_index + ) + return + render_grid( + experiment_id, layout_info_per_exp, file_manager, + 'flashtnt', identifier, grid_key + ) DEFAULT_LAYOUT = [ diff --git a/content/FLASHTnT/FLASHTnTViewerOI.py b/content/FLASHTnT/FLASHTnTViewerOI.py index b9853bc4..a1069e97 100644 --- a/content/FLASHTnT/FLASHTnTViewerOI.py +++ b/content/FLASHTnT/FLASHTnTViewerOI.py @@ -192,16 +192,28 @@ def _build_tag_table(file_manager, experiment_id: str, cache_dir: str): ) -def _tag_mass_lookup(file_manager, experiment_id: str) -> dict: - """Map ``TagIndex`` -> list[float] of the tag's masses (parsed from the - comma-joined ``mzs`` string with its trailing comma). Used to resolve a tag - selection into the mass list the combined-spectrum tagger overlay matches.""" +def _resolve_tag_masses(file_manager, experiment_id: str, state_manager) -> None: + """Resolve the selected ``tagData`` (a ``TagIndex``) to its list of masses and + publish under ``tagMasses`` so the combined-spectrum LinePlot tagger overlay + can read it. Clears ``tagMasses`` when no tag is selected. 
+ + Only the selected tag's row is collected (filtered by ``TagIndex``) instead + of building a full lookup over ``tag_dfs`` on every rerun. The tag ``mzs`` + are a comma-joined string with a trailing comma; parse and drop non-numeric + entries.""" + tag_index = state_manager.get_selection(TAG_KEY) + if tag_index is None: + state_manager.clear_selection(TAG_MASSES_KEY) + return + tags = _lazy(file_manager, experiment_id, "tag_dfs") if tags is None: - return {} - df = ( - tags.select(["TagIndex", "mzs"]) - .with_columns( + state_manager.clear_selection(TAG_MASSES_KEY) + return + + selected = ( + tags.filter(pl.col("TagIndex") == int(tag_index)) + .select( pl.col("mzs") .str.strip_chars(",") .str.split(",") @@ -210,22 +222,8 @@ def _tag_mass_lookup(file_manager, experiment_id: str) -> dict: ) .collect() ) - return { - int(ti): [m for m in masses if m is not None] - for ti, masses in zip(df["TagIndex"], df["tag_masses"].to_list()) - } - - -def _resolve_tag_masses(file_manager, experiment_id: str, state_manager) -> None: - """Resolve the selected ``tagData`` (a ``TagIndex``) to its list of masses and - publish under ``tagMasses`` so the combined-spectrum LinePlot tagger overlay - can read it. 
Clears ``tagMasses`` when no tag is selected.""" - tag_index = state_manager.get_selection(TAG_KEY) - if tag_index is None: - state_manager.clear_selection(TAG_MASSES_KEY) - return - lookup = _tag_mass_lookup(file_manager, experiment_id) - masses = lookup.get(int(tag_index)) + raw = selected["tag_masses"][0] if selected.height else None + masses = [m for m in raw if m is not None] if raw is not None else None if masses: state_manager.set_selection(TAG_MASSES_KEY, list(masses)) else: From 85d20d47bd2996c3e2c9c03e7bfdf53b6a1c96f6 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 2 Jun 2026 08:49:17 +0000 Subject: [PATCH 06/10] Fix viewer parity: table columns, sequence view, tag-walk spectrum Address feature-parity gaps in the OpenMS-Insight viewers vs the legacy grid: - Tables (protein/tag in TnT, scan/mass in Deconv): pass curated column_definitions mirroring the legacy Tabulator*.vue configs (human- readable titles, only the relevant fields, 2-decimal formatting), instead of dumping every raw frame column with auto-generated names. - Sequence View "No data": _build_sequence_frame expected the pickled-dict sequence_data format (stale example cache), but the live parseTnT writes it as parquet (build_table -> parquet_sink). Handle BOTH: parquet frame (LazyFrame/DataFrame) and legacy dict, building the same coverage/fixed-mod frame either way. - Augmented Deconvolved Spectrum: _resolve_tag_masses now publishes the tag walk {masses, residues} (residues = reversed TagSequence aligned to the stored-order mass gaps, matching the legacy reversed-index rule) so the OI LinePlot renders the residue walk + auto-zoom, not just peak highlights. 
--- content/FLASHDeconv/FLASHDeconvViewerOI.py | 42 ++++++ content/FLASHTnT/FLASHTnTViewerOI.py | 155 +++++++++++++++++---- 2 files changed, 170 insertions(+), 27 deletions(-) diff --git a/content/FLASHDeconv/FLASHDeconvViewerOI.py b/content/FLASHDeconv/FLASHDeconvViewerOI.py index e27ab1cb..d6e9a563 100644 --- a/content/FLASHDeconv/FLASHDeconvViewerOI.py +++ b/content/FLASHDeconv/FLASHDeconvViewerOI.py @@ -54,6 +54,44 @@ SCAN_KEY = "scanIndex" MASS_KEY = "massIndex" +# Curated column definitions mirroring the LEGACY Vue tables (titles / order / +# field selection). The OI Table's ``_get_columns_to_select`` projects to ONLY the +# fields named here (plus index / interactivity / filter columns), so any internal +# frame column not listed is hidden -- the visual-parity goal. + +# TabulatorScanTable.vue columns -> scan_table fields. Legacy "Index" (id) maps to +# the frame's `index` (row position == scan index). +_SCAN_COLUMN_DEFINITIONS = [ + {"title": "Index", "field": "index", "sorter": "number"}, + {"title": "Scan Number", "field": "Scan", "sorter": "number"}, + {"title": "MS Level", "field": "MSLevel", "sorter": "number"}, + {"title": "Retention time", "field": "RT", "sorter": "number", + "formatter": "money", "formatterParams": {"precision": 2, "symbol": ""}}, + {"title": "Precursor Mass", "field": "PrecursorMass", "sorter": "number", + "formatter": "money", "formatterParams": {"precision": 2, "symbol": ""}}, + {"title": "#Masses", "field": "#Masses", "sorter": "number"}, +] + +# TabulatorMassTable.vue columns -> mass_table_long fields. Legacy "Index" (id) maps +# to the long frame's `mass_id` (0-based mass position within the scan). 
+_MASS_COLUMN_DEFINITIONS = [ + {"title": "Index", "field": "mass_id", "sorter": "number"}, + {"title": "Monoisotopic mass", "field": "MonoMass", "sorter": "number", + "formatter": "money", "formatterParams": {"precision": 2, "symbol": ""}}, + {"title": "Sum intensity", "field": "SumIntensity", "sorter": "number", + "formatter": "money", "formatterParams": {"precision": 2, "symbol": ""}}, + {"title": "Min charge", "field": "MinCharges", "sorter": "number"}, + {"title": "Max charge", "field": "MaxCharges", "sorter": "number"}, + {"title": "Min isotope", "field": "MinIsotopes", "sorter": "number"}, + {"title": "Max isotope", "field": "MaxIsotopes", "sorter": "number"}, + {"title": "Cosine score", "field": "CosineScore", "sorter": "number", + "formatter": "money", "formatterParams": {"precision": 2, "symbol": ""}}, + {"title": "SNR", "field": "SNR", "sorter": "number", + "formatter": "money", "formatterParams": {"precision": 2, "symbol": ""}}, + {"title": "QScore", "field": "QScore", "sorter": "number", + "formatter": "money", "formatterParams": {"precision": 2, "symbol": ""}}, +] + def _component_cache_dir(file_manager, experiment_id: str) -> str: """Directory under the workspace cache where OI component caches are written.""" @@ -118,6 +156,8 @@ def _build_scan_table(file_manager, experiment_id: str, cache_dir: str): data=data, interactivity={SCAN_KEY: "index"}, index_field="index", + column_definitions=_SCAN_COLUMN_DEFINITIONS, + go_to_fields=["index", "Scan"], title="Scan Table", cache_path=cache_dir, ) @@ -135,6 +175,8 @@ def _build_mass_table(file_manager, experiment_id: str, cache_dir: str): filters={SCAN_KEY: "index"}, interactivity={MASS_KEY: "mass_id"}, index_field="mass_id", + column_definitions=_MASS_COLUMN_DEFINITIONS, + go_to_fields=["mass_id"], title="Mass Table", cache_path=cache_dir, ) diff --git a/content/FLASHTnT/FLASHTnTViewerOI.py b/content/FLASHTnT/FLASHTnTViewerOI.py index a1069e97..f2aecff2 100644 --- a/content/FLASHTnT/FLASHTnTViewerOI.py +++ 
b/content/FLASHTnT/FLASHTnTViewerOI.py @@ -145,16 +145,54 @@ def _stamp_proteoform_index( # None when the underlying data frame is missing (component is silently skipped). # --------------------------------------------------------------------------- +# Curated column definitions mirroring the LEGACY Vue tables (titles / order / +# field selection). The OI Table's ``_get_columns_to_select`` projects to ONLY the +# fields named here (plus the index / interactivity / filter columns), so any +# internal frame column not listed is hidden -- exactly the parity goal. + +# TabulatorProteinTable.vue columns -> protein_dfs fields. +_PROTEIN_COLUMN_DEFINITIONS = [ + {"title": "Scan No.", "field": "Scan", "sorter": "number"}, + {"title": "Accession", "field": "accession", "sorter": "string"}, + {"title": "Description", "field": "description", "sorter": "string"}, + {"title": "Length", "field": "length", "sorter": "number"}, + {"title": "Mass", "field": "ProteoformMass", "sorter": "number"}, + {"title": "No. of Matched Fragments", "field": "MatchingFragments", "sorter": "number"}, + {"title": "No. of Modifications", "field": "ModCount", "sorter": "number"}, + {"title": "No. of Tags", "field": "TagCount", "sorter": "number"}, + {"title": "Score", "field": "Score", "sorter": "number"}, + {"title": "Q-Value (Proteoform Level)", "field": "ProteoformLevelQvalue", "sorter": "number"}, +] + +# TabulatorTagTable.vue columns -> tag_dfs fields. 
+_TAG_COLUMN_DEFINITIONS = [ + {"title": "Scan Number", "field": "Scan", "sorter": "number"}, + {"title": "Start Position", "field": "StartPos", "sorter": "number"}, + {"title": "End Position", "field": "EndPos", "sorter": "number"}, + {"title": "Sequence", "field": "TagSequence", "sorter": "string"}, + {"title": "Length", "field": "Length", "sorter": "number"}, + {"title": "Tag Score", "field": "Score", "sorter": "number"}, + {"title": "N mass", "field": "Nmass", "sorter": "number"}, + {"title": "C mass", "field": "Cmass", "sorter": "number"}, + {"title": "Δ mass", "field": "DeltaMass", "sorter": "number"}, +] + + def _build_protein_table(file_manager, experiment_id: str, cache_dir: str): data = _lazy(file_manager, experiment_id, "protein_dfs") if data is None: return None # Protein table: clicking a row sets proteinIndex to the row's `index`. + # Curated columns/titles match TabulatorProteinTable.vue (the `index` + # interactivity column is auto-included by the Table but stays hidden). return Table( cache_id=f"protein_table_{experiment_id}", data=data, interactivity={PROTEIN_KEY: "index"}, index_field="index", + column_definitions=_PROTEIN_COLUMN_DEFINITIONS, + initial_sort=[{"column": "Score", "dir": "desc"}], + go_to_fields=["Scan", "accession"], title="Protein Table", cache_path=cache_dir, ) @@ -187,20 +225,32 @@ def _build_tag_table(file_manager, experiment_id: str, cache_dir: str): filters={PROTEIN_KEY: "proteoform_index"}, interactivity={TAG_KEY: "TagIndex"}, index_field="TagIndex", + column_definitions=_TAG_COLUMN_DEFINITIONS, + initial_sort=[{"column": "Score", "dir": "desc"}], + go_to_fields=["Scan", "StartPos", "EndPos", "TagSequence"], title="Tag Table", cache_path=cache_dir, ) def _resolve_tag_masses(file_manager, experiment_id: str, state_manager) -> None: - """Resolve the selected ``tagData`` (a ``TagIndex``) to its list of masses and - publish under ``tagMasses`` so the combined-spectrum LinePlot tagger overlay - can read it. 
Clears ``tagMasses`` when no tag is selected. - - Only the selected tag's row is collected (filtered by ``TagIndex``) instead - of building a full lookup over ``tag_dfs`` on every rerun. The tag ``mzs`` - are a comma-joined string with a trailing comma; parse and drop non-numeric - entries.""" + """Resolve the selected ``tagData`` (a ``TagIndex``) to its masses + residue + walk and publish under ``tagMasses`` so the combined-spectrum LinePlot tagger + overlay renders the tag walk (residue letters between consecutive masses, with + the x-axis auto-zoomed to the tag span). Clears ``tagMasses`` when no tag is + selected. + + Only the selected tag's row is collected (filtered by ``TagIndex``). The tag + ``mzs`` are a comma-joined string (trailing comma); parse and drop non-numeric + entries, keeping the STORED order (ascending for C-term tags, descending for + N-term tags). ``TagSequence`` gives the residue letters; the legacy walks + consecutive stored masses labelling gap ``i`` with ``sequence[len-1-i]`` — + i.e. the REVERSED sequence aligns to the stored-order gaps regardless of + anchoring (verified against both an ascending C-term and a descending N-term + tag). Do NOT sort the masses: sorting breaks the alignment for descending + (N-term) tags. 
The published value is a dict + ``{"masses": [...], "residues": [...]}`` consumed by the OI LinePlot tag walk; + when no residues are available it carries only masses (highlight-only).""" tag_index = state_manager.get_selection(TAG_KEY) if tag_index is None: state_manager.clear_selection(TAG_MASSES_KEY) @@ -218,16 +268,32 @@ def _resolve_tag_masses(file_manager, experiment_id: str, state_manager) -> None .str.strip_chars(",") .str.split(",") .list.eval(pl.element().cast(pl.Float64, strict=False)) - .alias("tag_masses") + .alias("tag_masses"), + pl.col("TagSequence").alias("tag_sequence"), ) .collect() ) - raw = selected["tag_masses"][0] if selected.height else None - masses = [m for m in raw if m is not None] if raw is not None else None - if masses: - state_manager.set_selection(TAG_MASSES_KEY, list(masses)) - else: + if not selected.height: + state_manager.clear_selection(TAG_MASSES_KEY) + return + + raw = selected["tag_masses"][0] + # Keep STORED order (do not sort) so the reversed-sequence walk aligns for + # both ascending (C-term) and descending (N-term) tags. + masses = [m for m in raw if m is not None] if raw is not None else [] + if not masses: state_manager.clear_selection(TAG_MASSES_KEY) + return + + # Residue letter per consecutive-mass gap (len(masses) - 1 gaps): the legacy + # labels gap i with sequence[len-1-i], i.e. reversed(sequence) over the + # stored-order gaps. Trim to the number of gaps. 
+ seq = selected["tag_sequence"][0] or "" + residues = list(reversed(str(seq)))[: max(len(masses) - 1, 0)] + + state_manager.set_selection( + TAG_MASSES_KEY, {"masses": list(masses), "residues": residues} + ) def _build_sequence_frame( @@ -244,16 +310,46 @@ def _build_sequence_frame( Columns emitted: ``proteoform_index`` (filter key), ``sequence`` (str), ``precursor_charge`` (=1, neutral/deconvolved peaks), ``coverage`` (list[f64]), - ``maxCoverage`` (f64), ``fixed_modifications`` (list[str]).""" - # ``sequence_data`` is a pickle-backed store: a dict keyed by - # ``proteoform_index``, each value a dict with per-residue ``sequence`` / - # ``coverage`` lists (full protein), ``maxCoverage``, ``proteoform_start`` / - # ``proteoform_end`` and ``fixed_modifications``. It is NOT a tabular frame, - # so it is loaded as the raw object (``_pandas`` returns the unpickled dict) - # and iterated — loading it as a LazyFrame raises AttributeError and leaves - # SequenceView blank. - store = _pandas(file_manager, experiment_id, "sequence_data") - if not isinstance(store, dict) or not store: + ``maxCoverage`` (f64), ``fixed_modifications`` (list[str]). + + ``sequence_data`` is loaded with ``use_polars=True`` and arrives in EITHER of + two formats which are handled identically here: + + * **parquet (current ``parseTnT``):** ``build_table(sequence_data)`` -> + ``parquet_sink`` writes one row per proteoform (schema in + ``src/render/sequence_data_store.py``). With ``use_polars=True`` the loader + returns a polars ``LazyFrame`` (the proteoform-index column is + ``proteoform_index``; ``sequence`` is the FULL protein ``list[str]`` and + ``coverage`` is the matching full-length ``list[f64]``). + * **pickle dict (legacy ``.pkl.gz`` example caches):** a dict keyed by the + proteoform index, each value a dict carrying the same ``sequence`` / + ``coverage`` / ``maxCoverage`` / ``proteoform_start`` / ``proteoform_end`` / + ``fixed_modifications`` keys. 
``use_polars=True`` leaves the unpickled dict + untouched. + + In both formats ``sequence`` and ``coverage`` are full-length protein lists; we + slice the displayed proteoform substring AND its coverage together with the + 0-based ``proteoform_start`` / ``proteoform_end`` bounds (a negative/absent + bound means render the full protein).""" + if not file_manager.result_exists(experiment_id, "sequence_data"): + return None + store = file_manager.get_results( + experiment_id, ["sequence_data"], use_polars=True + )["sequence_data"] + + # Normalise either format into an iterable of per-proteoform row dicts. + if isinstance(store, pl.LazyFrame): + rows = store.collect().iter_rows(named=True) + elif isinstance(store, pl.DataFrame): + rows = store.iter_rows(named=True) + elif isinstance(store, dict): + if not store: + return None + rows = ( + {"proteoform_index": pid, **(store[pid] or {})} + for pid in sorted(store.keys()) + ) + else: return None proteoform_indices: List[int] = [] @@ -261,8 +357,10 @@ def _build_sequence_frame( coverages: List[list] = [] max_coverages: List[float] = [] fixed_mods: List[list] = [] - for pid in sorted(store.keys()): - entry = store[pid] or {} + for entry in rows: + pid = entry.get("proteoform_index") + if pid is None: + continue full = list(entry.get("sequence") or []) cov = list(entry.get("coverage") or []) start = entry.get("proteoform_start") @@ -275,13 +373,16 @@ def _build_sequence_frame( else: sub_seq, sub_cov = full[start:end + 1], cov[start:end + 1] proteoform_indices.append(int(pid)) - sequences.append("".join(sub_seq)) + sequences.append("".join(str(a) for a in sub_seq)) coverages.append([float(c) for c in sub_cov]) mc = entry.get("maxCoverage") max_coverages.append(float(mc) if mc is not None else 0.0) fm = entry.get("fixed_modifications") or [] fixed_mods.append([str(m) for m in fm]) + if not proteoform_indices: + return None + out = pl.DataFrame({ "proteoform_index": proteoform_indices, "sequence": sequences, From 
e3853047eb4bf59c65ab5af9b6bb6fff720788e4 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 2 Jun 2026 10:54:25 +0000 Subject: [PATCH 07/10] Wire Sequence View residue click to filter the Tag Table Clicking a covered amino acid in the Sequence View now filters the Tag Table to the tags covering that residue, matching the legacy cross-link. - _build_sequence_frame emits a sequence_offset column so the SequenceView can report protein-absolute residue positions even when the displayed sequence is sliced to the proteoform substring (offset is 0 / unchanged for the bundled example proteoforms). - SequenceView interactivity gains {selectedAApos: "residue_position"} (keeps the peak_id mapping); the Tag Table gains range_filters={selectedAApos: ("StartPos", "EndPos")} alongside the existing proteoform filter, so it shows tags with StartPos <= pos <= EndPos. --- content/FLASHTnT/FLASHTnTViewerOI.py | 33 +++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/content/FLASHTnT/FLASHTnTViewerOI.py b/content/FLASHTnT/FLASHTnTViewerOI.py index f2aecff2..77d1f101 100644 --- a/content/FLASHTnT/FLASHTnTViewerOI.py +++ b/content/FLASHTnT/FLASHTnTViewerOI.py @@ -74,6 +74,15 @@ TAG_KEY = "tagData" TAG_MASSES_KEY = "tagMasses" MASS_KEY = "massIndex" +# Residue -> Tag-Table cross-link (legacy `selectionStore.selectedAApos`). +# Clicking a covered residue in the SequenceView sets this to the residue's +# PROTEIN-ABSOLUTE 0-based position; the Tag Table range-filters its rows to tags +# whose [StartPos, EndPos] span contains that position (StartPos <= pos <= EndPos), +# clearing on re-click (toggle). The SequenceView publishes protein-absolute +# coordinates via the per-proteoform `sequence_offset` carried in the sequence +# frame, so this matches StartPos/EndPos for ALL proteoforms (not just the +# bundled full-protein example). 
+AA_KEY = "selectedAApos" def _component_cache_dir(file_manager, experiment_id: str) -> str: @@ -223,6 +232,11 @@ def _build_tag_table(file_manager, experiment_id: str, cache_dir: str): cache_id=f"tag_table_{experiment_id}", data=tag_lf, filters={PROTEIN_KEY: "proteoform_index"}, + # Residue -> Tag-Table cross-link (legacy StartPos<=selectedAApos<=EndPos): + # when a covered residue is clicked in the SequenceView, AA_KEY holds its + # protein-absolute position and the tags are narrowed to those whose span + # contains it; cleared (no-op) when no residue is selected. + range_filters={AA_KEY: ("StartPos", "EndPos")}, interactivity={TAG_KEY: "TagIndex"}, index_field="TagIndex", column_definitions=_TAG_COLUMN_DEFINITIONS, @@ -357,6 +371,7 @@ def _build_sequence_frame( coverages: List[list] = [] max_coverages: List[float] = [] fixed_mods: List[list] = [] + sequence_offsets: List[int] = [] for entry in rows: pid = entry.get("proteoform_index") if pid is None: @@ -370,8 +385,18 @@ def _build_sequence_frame( # negative/absent bound means render the full protein. if start is None or end is None or start < 0 or end < 0: sub_seq, sub_cov = full, cov + offset = 0 else: sub_seq, sub_cov = full[start:end + 1], cov[start:end + 1] + offset = int(start) + # Coordinate decision (a): the displayed sequence is the proteoform + # substring starting at protein position `start`, so grid index i maps to + # protein-absolute position `start + i`. We carry `offset` (clamped to >= 0, + # mirroring the legacy AminoAcidCell.start getter) so the residue-click + # cross-link emits protein-absolute positions matching tag StartPos/EndPos. + # For the bundled example (start=0, end=-2 => full protein) offset is 0, so + # the emitted position equals the grid index exactly as before — the example + # render and coordinates are unchanged. 
proteoform_indices.append(int(pid)) sequences.append("".join(str(a) for a in sub_seq)) coverages.append([float(c) for c in sub_cov]) @@ -379,6 +404,7 @@ def _build_sequence_frame( max_coverages.append(float(mc) if mc is not None else 0.0) fm = entry.get("fixed_modifications") or [] fixed_mods.append([str(m) for m in fm]) + sequence_offsets.append(max(offset, 0)) if not proteoform_indices: return None @@ -390,6 +416,7 @@ def _build_sequence_frame( "coverage": coverages, "maxCoverage": max_coverages, "fixed_modifications": fixed_mods, + "sequence_offset": sequence_offsets, }) return out.lazy() @@ -424,7 +451,11 @@ def _build_sequence_view(file_manager, experiment_id: str, cache_dir: str): sequence_data=seq_frame, peaks_data=peaks, filters={PROTEIN_KEY: "proteoform_index"}, - interactivity={MASS_KEY: "peak_id"}, + # Two click sources: a fragment-table row click sets MASS_KEY to the + # matched peak's peak_id (combined-spectrum cross-link), and a SequenceView + # RESIDUE click sets AA_KEY to the clicked residue's protein-absolute + # position via the "residue_position" sentinel (Tag-Table range filter). + interactivity={MASS_KEY: "peak_id", AA_KEY: "residue_position"}, deconvolved=True, compute_fixed_mods=True, settings=settings, From f50036236f7ebb7fbe544d4edaa10e670d1dfbf8 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 2 Jun 2026 12:57:10 +0000 Subject: [PATCH 08/10] Wire viewer parity: drill-down data, formatters, truncation, mods, resets Parse (src/parse/deconv.py): preserve per-deconv-peak signal arrays (signal_mzs/charges/intensities on combined_spectrum_long + deconv_spectrum_long) for the charge-state drill-down, and add Scan/PrecursorScan/PrecursorMass/ MonoMass to threedim_SN_plot for the 3D precursor-signal lookup. 
FLASHTnT viewer: protein/tag tables use dashNegativeOne (-1 -> "-"); protein table reproduces the default-ON "Best per spectrum" collapse (max Score per Scan) with a toggle; SequenceView now renders the FULL protein with truncation (proteoform_start/end + -2 sentinels), ambiguous mod_ranges, Precursor/Proteoform mass header, and tag-span highlight; _resolve_tag_masses publishes nTerminal + the tag span; combined spectrum gets the signal drill-down columns; dependent selections (tag/tagMasses/AApos/tagSpan/mass) clear on proteoform change. FLASHDeconv viewer: scan/mass tables use the plain 4-decimal "fixed" formatter (no thousands grouping); Scatter3D wired for the precursor-signal lookup; deconv spectrum gets the drill-down columns; submitted-sequence SequenceView bakes the selected fixed C/M mods into the sequence (correct fragment masses), enables the variable-mod menu, consumes the custom sequence (sequence_out), and emits the mass header; massIndex resets on scan change. --- content/FLASHDeconv/FLASHDeconvViewerOI.py | 132 +++++++- content/FLASHDeconv/deconv_sequence.py | 118 +++++++ content/FLASHTnT/FLASHTnTViewerOI.py | 338 ++++++++++++++++----- src/parse/deconv.py | 158 ++++++++-- tests/test_deconv_long_format.py | 129 +++++++- 5 files changed, 764 insertions(+), 111 deletions(-) create mode 100644 content/FLASHDeconv/deconv_sequence.py diff --git a/content/FLASHDeconv/FLASHDeconvViewerOI.py b/content/FLASHDeconv/FLASHDeconvViewerOI.py index d6e9a563..df7eb9d4 100644 --- a/content/FLASHDeconv/FLASHDeconvViewerOI.py +++ b/content/FLASHDeconv/FLASHDeconvViewerOI.py @@ -45,6 +45,11 @@ Table, ) +from content.FLASHDeconv.deconv_sequence import ( + bake_fixed_modifications, + theoretical_mass, +) + # Map the layout COMPONENT_NAMES (FLASHDeconvLayoutManager) to a builder. Every # builder returns a *callable* OpenMS-Insight component already wired with the # shared filters/interactivity identifiers. 
The identifiers below are the FLASHApp @@ -53,6 +58,10 @@ SCAN_KEY = "scanIndex" MASS_KEY = "massIndex" +# Receives the user-entered sequence from the SequenceView "Change sequence" +# dialog (Vue `sequence_out` interactivity sentinel). Mirrors the legacy +# `sequenceOut` selection consumed by src/render/update.py:get_sequence. +SEQ_OUT_KEY = "sequenceOut" # Curated column definitions mirroring the LEGACY Vue tables (titles / order / # field selection). The OI Table's ``_get_columns_to_select`` projects to ONLY the @@ -66,9 +75,9 @@ {"title": "Scan Number", "field": "Scan", "sorter": "number"}, {"title": "MS Level", "field": "MSLevel", "sorter": "number"}, {"title": "Retention time", "field": "RT", "sorter": "number", - "formatter": "money", "formatterParams": {"precision": 2, "symbol": ""}}, + "formatter": "fixed", "formatterParams": {"precision": 4}}, {"title": "Precursor Mass", "field": "PrecursorMass", "sorter": "number", - "formatter": "money", "formatterParams": {"precision": 2, "symbol": ""}}, + "formatter": "fixed", "formatterParams": {"precision": 4}}, {"title": "#Masses", "field": "#Masses", "sorter": "number"}, ] @@ -77,19 +86,19 @@ _MASS_COLUMN_DEFINITIONS = [ {"title": "Index", "field": "mass_id", "sorter": "number"}, {"title": "Monoisotopic mass", "field": "MonoMass", "sorter": "number", - "formatter": "money", "formatterParams": {"precision": 2, "symbol": ""}}, + "formatter": "fixed", "formatterParams": {"precision": 4}}, {"title": "Sum intensity", "field": "SumIntensity", "sorter": "number", - "formatter": "money", "formatterParams": {"precision": 2, "symbol": ""}}, + "formatter": "fixed", "formatterParams": {"precision": 4}}, {"title": "Min charge", "field": "MinCharges", "sorter": "number"}, {"title": "Max charge", "field": "MaxCharges", "sorter": "number"}, {"title": "Min isotope", "field": "MinIsotopes", "sorter": "number"}, {"title": "Max isotope", "field": "MaxIsotopes", "sorter": "number"}, {"title": "Cosine score", "field": "CosineScore", 
"sorter": "number", - "formatter": "money", "formatterParams": {"precision": 2, "symbol": ""}}, + "formatter": "fixed", "formatterParams": {"precision": 4}}, {"title": "SNR", "field": "SNR", "sorter": "number", - "formatter": "money", "formatterParams": {"precision": 2, "symbol": ""}}, + "formatter": "fixed", "formatterParams": {"precision": 4}}, {"title": "QScore", "field": "QScore", "sorter": "number", - "formatter": "money", "formatterParams": {"precision": 2, "symbol": ""}}, + "formatter": "fixed", "formatterParams": {"precision": 4}}, ] @@ -187,6 +196,8 @@ def _build_deconv_spectrum(file_manager, experiment_id: str, cache_dir: str): if data is None: return None # Deconvolved spectrum: filtered by scan; clicking a peak sets massIndex. + # The per-row signal_* list columns (emitted on deconv_spectrum_long by + # src/parse/deconv.py) drive the per-mass charge-state drill-down sub-view. return LinePlot( cache_id=f"deconv_spectrum_{experiment_id}", data=data, @@ -194,6 +205,9 @@ def _build_deconv_spectrum(file_manager, experiment_id: str, cache_dir: str): interactivity={MASS_KEY: "peak_id"}, x_column="MonoMass", y_column="SumIntensity", + signal_mz_column="signal_mzs", + signal_charge_column="signal_charges", + signal_intensity_column="signal_intensities", title="Deconvolved Spectrum", x_label="Monoisotopic Mass", y_label="Intensity", @@ -220,6 +234,14 @@ def _build_anno_spectrum(file_manager, experiment_id: str, cache_dir: str): def _build_combined_spectrum(file_manager, experiment_id: str, cache_dir: str): + # DEAD CODE in the OI Deconv viewer: the FLASHDeconv layout (see + # FLASHDeconvLayoutManager.COMPONENT_NAMES) exposes no "combined_spectrum" / + # augmented panel, and this builder is not registered in COMPONENT_BUILDERS, + # so it is never invoked. The "Augmented Deconvolved Spectrum" only exists in + # the LEGACY grid path (src/render/initialize.py). 
The real Deconv spectrum the + # OI layout renders is `deconv_spectrum` (_build_deconv_spectrum), which now + # carries the signal_* charge drill-down columns. Kept here for reference until + # an augmented panel is (if ever) added to the Deconv layout. primary = _lazy(file_manager, experiment_id, "combined_spectrum_long") if primary is None: return None @@ -267,6 +289,14 @@ def _build_scatter3d(file_manager, experiment_id: str, cache_dir: str): scan_filter="index", signal_column="SignalPeaks", noisy_column="NoisyPeaks", + # MS2 precursor-signal lookup: locate the precursor scan's row + # (Scan == PrecursorScan) and the index into its MonoMass array whose + # value matches PrecursorMass. All four columns are emitted on + # threedim_SN_plot by src/parse/deconv.py. + scan_column="Scan", + precursor_scan_column="PrecursorScan", + precursor_mass_column="PrecursorMass", + mono_mass_column="MonoMass", title="Precursor Signals", cache_path=cache_dir, ) @@ -300,15 +330,40 @@ def _get_sequence(file_manager): ) -def _build_sequence_view(file_manager, experiment_id: str, cache_dir: str): +def _build_sequence_view( + file_manager, experiment_id: str, cache_dir: str, state_manager=None +): seq = _get_sequence(file_manager) if seq is None: return None - sequence_string, _fix_c, _fix_m = seq + submitted_sequence, fix_c, fix_m = seq + + # Prefer a sequence the user entered via the SequenceView "Change sequence" + # dialog (Vue emits it into the `sequenceOut` selection through the + # `sequence_out` interactivity sentinel). Mirrors legacy + # src/render/update.py:get_sequence which prefers `sequenceOut`. The + # user-entered sequence is taken verbatim (no fixed-mod baking, matching the + # legacy path which returns it with no C/M mods). 
+ user_sequence = None + if state_manager is not None: + candidate = state_manager.get_selection(SEQ_OUT_KEY) + if isinstance(candidate, str) and len(candidate) > 0: + user_sequence = candidate + + if user_sequence is not None: + sequence_string = user_sequence + else: + # Bake the selected C/M fixed modifications into the sequence string so + # the theoretical fragment masses (computed by SequenceView via pyOpenMS + # from the literal string) reflect them -- parity with the legacy + # setFixedModification, which applied the mods BEFORE fragment-mass + # calculation. (compute_fixed_mods only marks residue types; it does NOT + # shift masses, so baking is required.) + sequence_string = bake_fixed_modifications(submitted_sequence, fix_c, fix_m) + # Deconv peaks are neutral masses (deconvolved=True). Wire the deconv long # spectrum as the peaks_data (renamed to the SequenceView schema: peak_id, - # mass, intensity), filtered by the selected scan. C/M fixed mods are computed - # from the sequence (compute_fixed_mods=True) for Deconv parity. + # mass, intensity), filtered by the selected scan. peaks = _lazy(file_manager, experiment_id, "deconv_spectrum_long") if peaks is None: return None @@ -318,14 +373,36 @@ def _build_sequence_view(file_manager, experiment_id: str, cache_dir: str): pl.col("MonoMass").alias("mass"), pl.col("SumIntensity").alias("intensity"), ) + + # Pass the sequence as a single-row frame so we can attach the optional + # `computed_mass` column (the baked sequence's monoisotopic mass) for the + # SequenceView mass header. Falls back to a plain string when pyOpenMS is + # unavailable (theoretical_mass returns None) so the column is simply omitted. 
+ seq_mass = theoretical_mass(sequence_string) + if seq_mass is not None: + sequence_data = pl.LazyFrame( + { + "sequence": [sequence_string], + "precursor_charge": [1], + "computed_mass": [seq_mass], + } + ) + else: + sequence_data = sequence_string + return SequenceView( cache_id=f"sequence_view_{experiment_id}", - sequence_data=sequence_string, + sequence_data=sequence_data, peaks_data=peaks, filters={SCAN_KEY: "index"}, - interactivity={MASS_KEY: "peak_id"}, + # Click a fragment-table row -> set massIndex to the matched peak_id. + # The "Change sequence" dialog -> set sequenceOut to the entered sequence. + interactivity={MASS_KEY: "peak_id", SEQ_OUT_KEY: "sequence_out"}, deconvolved=True, compute_fixed_mods=True, + # Enable the variable / custom modification context menu on this + # submitted-sequence path (TnT path keeps it disabled). + disable_variable_modifications=False, title="Sequence View", cache_path=cache_dir, ) @@ -347,13 +424,23 @@ def _build_sequence_view(file_manager, experiment_id: str, cache_dir: str): "mass_table": _build_mass_table, "3D_SN_plot": _build_scatter3d, "fdr_plot": _build_fdr_plot, - "sequence_view": _build_sequence_view, + # sequence_view is built separately (needs the panel StateManager to consume + # the `sequenceOut` selection); see build_component. # internal_fragment_map: deferred (component disabled in the legacy path too). } -def build_component(file_manager, experiment_id: str, cache_dir: str, comp_name: str): +def build_component( + file_manager, experiment_id: str, cache_dir: str, comp_name: str, + state_manager=None, +): """Instantiate the OpenMS-Insight component for a layout cell, or None.""" + if comp_name == "sequence_view": + # The SequenceView builder consumes the user-entered sequence from the + # panel StateManager (`sequenceOut`); the other builders are stateless. 
+ return _build_sequence_view( + file_manager, experiment_id, cache_dir, state_manager=state_manager + ) builder = COMPONENT_BUILDERS.get(comp_name) if builder is None: return None @@ -375,12 +462,25 @@ def render_experiment_panel( state_manager = StateManager(session_key=session_key) cache_dir = _component_cache_dir(file_manager, experiment_id) + # When the selected scan changes, clear the mass selection so the mass table / + # 3D plot / spectrum highlight do not keep a stale mass from the prior scan + # (parity with TabulatorScanTable.vue:85-95, which clears the mass selection on + # a fresh scan-row click). We track the last-seen scanIndex per panel via a + # dedicated session_state key so the reset triggers once per change. + scan_seen_key = f"{session_key}__last_scan_index" + current_scan = state_manager.get_selection(SCAN_KEY) + last_scan = st.session_state.get(scan_seen_key) + if current_scan != last_scan: + state_manager.clear_selection(MASS_KEY) + st.session_state[scan_seen_key] = current_scan + for row_index, row in enumerate(layout_info_per_exp): columns = st.columns(len(row)) for col, (col_index, comp_name) in zip(columns, enumerate(row)): with col: component = build_component( - file_manager, experiment_id, cache_dir, comp_name + file_manager, experiment_id, cache_dir, comp_name, + state_manager=state_manager, ) # A builder returns None when its optional backing frame is # absent (e.g. no sequence submitted, or *_long not yet cached); diff --git a/content/FLASHDeconv/deconv_sequence.py b/content/FLASHDeconv/deconv_sequence.py new file mode 100644 index 00000000..87c2fb77 --- /dev/null +++ b/content/FLASHDeconv/deconv_sequence.py @@ -0,0 +1,118 @@ +"""Helpers for the FLASHDeconv OpenMS-Insight SequenceView path. + +The FLASHDeconv "submitted sequence" path lets the user pick a fixed +modification on cysteine and/or methionine (``fixed_mod_cysteine`` / +``fixed_mod_methionine`` in ``src/render/sequence.py``). 
The legacy renderer +applied those via ``setFixedModification`` *before* computing theoretical +fragment masses, so the masses reflected the mods. + +The OpenMS-Insight ``SequenceView`` computes theoretical fragment masses from +the literal sequence string (``calculate_fragment_masses_pyopenms``), and its +``compute_fixed_mods`` flag only *marks* which residue types carry a mod (for +display) -- it does NOT shift the fragment masses. To get parity we therefore +BAKE the selected fixed mods into the sequence string (e.g. +``C(Carbamidomethyl)``) so pyOpenMS includes the mass shift in every fragment. + +Mapping the FLASHApp option label (e.g. ``'Carbamidomethyl (+57)'``) to an +OpenMS modification name is done by mass, mirroring ``setFixedModification``'s +``ModificationsDB().getBestModificationByDiffMonoMass`` lookup, so the baked +name is one ``AASequence.fromString`` accepts. +""" + +from __future__ import annotations + +from typing import Optional + +# Mass shifts for the selectable fixed modifications, mirroring +# ``src/render/sequence.py`` (``fixed_mod_cysteine`` / ``fixed_mod_methionine``). +# Duplicated here (rather than imported) so this helper does not pull in +# ``src/render/sequence.py``'s top-level ``pyopenms`` import at module load: that +# keeps the helper importable/testable when pyOpenMS is absent (the mass-based +# name resolution and theoretical-mass calc degrade gracefully below). +fixed_mod_cysteine = { + "No modification": 0, + "Carbamidomethyl (+57)": 57.021464, + "Carboxymethyl (+58)": 58.005479, + "Xlink:Disulfide (-1 per C)": -1.007825, +} +fixed_mod_methionine = { + "No modification": 0, + "L-methionine sulfoxide (+16)": 15.994915, + "L-methionine sulfone (+32)": 31.989829, +} + + +def _resolve_mod_name(diff_mass: float, residue: str) -> Optional[str]: + """Resolve an OpenMS modification id for a mass shift on ``residue``. + + Mirrors ``setFixedModification`` (``getBestModificationByDiffMonoMass``). 
+ Returns None if pyOpenMS is unavailable or no modification matches. + """ + if diff_mass == 0: + return None + try: + from pyopenms import ModificationsDB + except Exception: + return None + try: + mod = ModificationsDB().getBestModificationByDiffMonoMass( + diff_mass, 0.001, residue, 0 + ) + except Exception: + return None + if mod is None: + return None + try: + name = mod.getId() + except Exception: + return None + return name or None + + +def bake_fixed_modifications( + sequence: str, fix_c: Optional[str], fix_m: Optional[str] +) -> str: + """Return ``sequence`` with the chosen C/M fixed mods baked in as OpenMS mods. + + ``fix_c`` / ``fix_m`` are FLASHApp option labels (keys of + ``fixed_mod_cysteine`` / ``fixed_mod_methionine``); falsy / 'No modification' + leave that residue untouched. Unknown labels or a missing pyOpenMS leave the + sequence unchanged (graceful degradation; the static string still renders). + """ + if not sequence: + return sequence + + c_name = None + if fix_c and fix_c in fixed_mod_cysteine: + c_name = _resolve_mod_name(fixed_mod_cysteine[fix_c], "C") + m_name = None + if fix_m and fix_m in fixed_mod_methionine: + m_name = _resolve_mod_name(fixed_mod_methionine[fix_m], "M") + + if c_name is None and m_name is None: + return sequence + + out = [] + for aa in sequence: + out.append(aa) + if aa == "C" and c_name is not None: + out.append(f"({c_name})") + elif aa == "M" and m_name is not None: + out.append(f"({m_name})") + return "".join(out) + + +def theoretical_mass(sequence: str) -> Optional[float]: + """Monoisotopic mass of the (possibly modified) sequence, or None. + + Used to populate the SequenceView mass header (``computed_mass``). Returns + None when pyOpenMS is unavailable so the caller simply omits the column. 
+ """ + if not sequence: + return None + try: + from pyopenms import AASequence + + return AASequence.fromString(sequence).getMonoWeight() + except Exception: + return None diff --git a/content/FLASHTnT/FLASHTnTViewerOI.py b/content/FLASHTnT/FLASHTnTViewerOI.py index 77d1f101..bd224773 100644 --- a/content/FLASHTnT/FLASHTnTViewerOI.py +++ b/content/FLASHTnT/FLASHTnTViewerOI.py @@ -78,11 +78,16 @@ # Clicking a covered residue in the SequenceView sets this to the residue's # PROTEIN-ABSOLUTE 0-based position; the Tag Table range-filters its rows to tags # whose [StartPos, EndPos] span contains that position (StartPos <= pos <= EndPos), -# clearing on re-click (toggle). The SequenceView publishes protein-absolute -# coordinates via the per-proteoform `sequence_offset` carried in the sequence -# frame, so this matches StartPos/EndPos for ALL proteoforms (not just the -# bundled full-protein example). +# clearing on re-click (toggle). The SequenceView now renders the FULL protein +# sequence, so the residue grid index IS the protein-absolute position +# (`sequence_offset` == 0) and the emitted coordinate matches tag StartPos/EndPos +# for ALL proteoforms directly. AA_KEY = "selectedAApos" +# Tag-span highlight on the SequenceView (legacy `selectedTag.{startPos,endPos}`). +# Published as {"start": StartPos, "end": EndPos, "nTerminal": bool} (protein- +# absolute indices) and consumed by the SequenceView `"tag_span"` interactivity +# sentinel, which reads this selection value to bracket-highlight the tag span. 
+TAG_SPAN_KEY = "tagSpan" def _component_cache_dir(file_manager, experiment_id: str) -> str: @@ -165,12 +170,20 @@ def _stamp_proteoform_index( {"title": "Accession", "field": "accession", "sorter": "string"}, {"title": "Description", "field": "description", "sorter": "string"}, {"title": "Length", "field": "length", "sorter": "number"}, - {"title": "Mass", "field": "ProteoformMass", "sorter": "number"}, + # Legacy TabulatorProteinTable.vue renders the `-1` sentinel as "-" (the raw + # value otherwise). The OI `dashNegativeOne` formatter reproduces the sentinel + # rule; precision 4 matches the app-wide `toFixedFormatter()` default (4 dp, + # used for every other numeric mass column, e.g. MonoMass in the mass table). + {"title": "Mass", "field": "ProteoformMass", "sorter": "number", + "formatter": "dashNegativeOne", "formatterParams": {"precision": 4}}, {"title": "No. of Matched Fragments", "field": "MatchingFragments", "sorter": "number"}, {"title": "No. of Modifications", "field": "ModCount", "sorter": "number"}, {"title": "No. of Tags", "field": "TagCount", "sorter": "number"}, {"title": "Score", "field": "Score", "sorter": "number"}, - {"title": "Q-Value (Proteoform Level)", "field": "ProteoformLevelQvalue", "sorter": "number"}, + # Q-Value also uses the `-1 -> "-"` sentinel rule (legacy formatter); 4 dp + # matches the app-wide default decimal precision. + {"title": "Q-Value (Proteoform Level)", "field": "ProteoformLevelQvalue", "sorter": "number", + "formatter": "dashNegativeOne", "formatterParams": {"precision": 4}}, ] # TabulatorTagTable.vue columns -> tag_dfs fields. 
@@ -181,21 +194,62 @@ def _stamp_proteoform_index( {"title": "Sequence", "field": "TagSequence", "sorter": "string"}, {"title": "Length", "field": "Length", "sorter": "number"}, {"title": "Tag Score", "field": "Score", "sorter": "number"}, - {"title": "N mass", "field": "Nmass", "sorter": "number"}, - {"title": "C mass", "field": "Cmass", "sorter": "number"}, + # N/C mass use the legacy `-1 -> "-"` sentinel rule (TabulatorTagTable.vue + # ~72-83); precision 4 matches the app-wide mass decimal default. + {"title": "N mass", "field": "Nmass", "sorter": "number", + "formatter": "dashNegativeOne", "formatterParams": {"precision": 4}}, + {"title": "C mass", "field": "Cmass", "sorter": "number", + "formatter": "dashNegativeOne", "formatterParams": {"precision": 4}}, {"title": "Δ mass", "field": "DeltaMass", "sorter": "number"}, ] -def _build_protein_table(file_manager, experiment_id: str, cache_dir: str): +def _filter_best_per_spectrum(protein_lf: pl.LazyFrame) -> pl.LazyFrame: + """Collapse the protein frame to the highest-``Score`` proteoform per ``Scan``. + + Reproduces the legacy default-ON "Best per spectrum" toggle + (TabulatorProteinTable.vue ~57-58, 116-198): keep, per ``Scan``, only the row + with the maximum ``Score``; ties keep the first-seen row (lowest ``index``). + Rows without a numeric ``Scan`` pass through unchanged (legacy passthrough). + """ + # Rank within each Scan by descending Score, tie-broken by ascending index so a + # deterministic single survivor is kept (mirrors the legacy first-seen tie rule + # once the frame is read in index order). + ranked = protein_lf.with_columns( + pl.col("Score") + .rank(method="ordinal", descending=True) + .over("Scan") + .alias("_score_rank") + ) + # Keep the best row per Scan; rows with a null Scan are passed through (their + # rank within the null group is irrelevant — keep them all, matching legacy). 
+ kept = ranked.filter( + (pl.col("_score_rank") == 1) | pl.col("Scan").is_null() + ).drop("_score_rank") + return kept + + +def _build_protein_table( + file_manager, experiment_id: str, cache_dir: str, + best_per_spectrum: bool = True, +): data = _lazy(file_manager, experiment_id, "protein_dfs") if data is None: return None + # Best-per-spectrum (default ON): pre-filter to the max-Score proteoform per + # Scan BEFORE building the Table so the displayed rows / default-selected best + # hit / pagination all operate on the collapsed set (legacy parity). The + # checkbox in render_experiment_panel toggles this off to show every hit. + if best_per_spectrum: + data = _filter_best_per_spectrum(data) + # The cache_id encodes the toggle so the ON / OFF frames get distinct caches + # (the Table caches its preprocessed parquet by cache_id). + suffix = "best" if best_per_spectrum else "all" # Protein table: clicking a row sets proteinIndex to the row's `index`. # Curated columns/titles match TabulatorProteinTable.vue (the `index` # interactivity column is auto-included by the Table but stays hidden). return Table( - cache_id=f"protein_table_{experiment_id}", + cache_id=f"protein_table_{experiment_id}_{suffix}", data=data, interactivity={PROTEIN_KEY: "index"}, index_field="index", @@ -265,14 +319,18 @@ def _resolve_tag_masses(file_manager, experiment_id: str, state_manager) -> None (N-term) tags. 
The published value is a dict ``{"masses": [...], "residues": [...]}`` consumed by the OI LinePlot tag walk; when no residues are available it carries only masses (highlight-only).""" + def _clear_all() -> None: + state_manager.clear_selection(TAG_MASSES_KEY) + state_manager.clear_selection(TAG_SPAN_KEY) + tag_index = state_manager.get_selection(TAG_KEY) if tag_index is None: - state_manager.clear_selection(TAG_MASSES_KEY) + _clear_all() return tags = _lazy(file_manager, experiment_id, "tag_dfs") if tags is None: - state_manager.clear_selection(TAG_MASSES_KEY) + _clear_all() return selected = ( @@ -284,11 +342,15 @@ def _resolve_tag_masses(file_manager, experiment_id: str, state_manager) -> None .list.eval(pl.element().cast(pl.Float64, strict=False)) .alias("tag_masses"), pl.col("TagSequence").alias("tag_sequence"), + # Anchoring + span (legacy TabulatorTagTable.vue:142-173). + pl.col("StartPos").alias("start_pos"), + pl.col("EndPos").alias("end_pos"), + pl.col("Nmass").alias("n_mass"), ) .collect() ) if not selected.height: - state_manager.clear_selection(TAG_MASSES_KEY) + _clear_all() return raw = selected["tag_masses"][0] @@ -296,7 +358,7 @@ def _resolve_tag_masses(file_manager, experiment_id: str, state_manager) -> None # both ascending (C-term) and descending (N-term) tags. masses = [m for m in raw if m is not None] if raw is not None else [] if not masses: - state_manager.clear_selection(TAG_MASSES_KEY) + _clear_all() return # Residue letter per consecutive-mass gap (len(masses) - 1 gaps): the legacy @@ -305,10 +367,76 @@ def _resolve_tag_masses(file_manager, experiment_id: str, state_manager) -> None seq = selected["tag_sequence"][0] or "" residues = list(reversed(str(seq)))[: max(len(masses) - 1, 0)] + # Terminal anchoring (legacy `nTerminal = (Nmass == -1)`): an N-terminal tag is + # one whose N-mass is the `-1` sentinel. Forwarded into the tag-walk so the + # LinePlot honors the requested direction. 
+ n_mass = selected["n_mass"][0] + n_terminal = (n_mass is not None) and (float(n_mass) == -1.0) + state_manager.set_selection( - TAG_MASSES_KEY, {"masses": list(masses), "residues": residues} + TAG_MASSES_KEY, + {"masses": list(masses), "residues": residues, "nTerminal": n_terminal}, ) + # Tag-span highlight on the SequenceView. StartPos/EndPos are protein-absolute + # (matching the full-protein residue grid), so they bracket the tag directly. + start_pos = selected["start_pos"][0] + end_pos = selected["end_pos"][0] + if start_pos is not None and end_pos is not None: + state_manager.set_selection( + TAG_SPAN_KEY, + {"start": int(start_pos), "end": int(end_pos), "nTerminal": n_terminal}, + ) + else: + state_manager.clear_selection(TAG_SPAN_KEY) + + +def _normalize_mod_ranges(raw) -> list: + """Normalize the cache ``modifications`` field into the SequenceView + ``mod_ranges`` shape: a list of ``{start, end, mass_diff, labels}`` dicts. + + The ``modifications`` field is a ``list[struct{start,end,mass_diff,labels}]`` + (sequence_data_store.SCHEMA) carrying ambiguous/spanning modification ranges + (DISTINCT from per-residue fixed mods). Entries missing start/end are skipped; + indices are protein-absolute (the SequenceView renders the full protein).""" + if raw is None: + return [] + out = [] + for item in raw: + if not isinstance(item, dict): + continue + if item.get("start") is None or item.get("end") is None: + continue + md = item.get("mass_diff") + labels = item.get("labels") + out.append({ + "start": int(item["start"]), + "end": int(item["end"]), + "mass_diff": float(md) if md is not None else 0.0, + "labels": "" if labels is None else str(labels), + }) + return out + + +def _precursor_mass_lookup(file_manager, experiment_id: str) -> dict: + """``proteoform_index -> PrecursorMass`` from ``protein_dfs`` (or ``{}``). + + The observed precursor mass is not stored in ``sequence_data``; the protein + frame carries it per proteoform (``index`` == proteoform_index). 
Used for the + SequenceView mass header ``precursor_mass`` column.""" + protein = _lazy(file_manager, experiment_id, "protein_dfs") + if protein is None: + return {} + schema = protein.collect_schema().names() + if "PrecursorMass" not in schema or "index" not in schema: + return {} + df = protein.select(["index", "PrecursorMass"]).collect() + return { + int(i): float(m) + for i, m in zip(df["index"].to_list(), df["PrecursorMass"].to_list()) + if i is not None and m is not None + } + def _build_sequence_frame( file_manager, experiment_id: str @@ -316,35 +444,35 @@ def _build_sequence_frame( """Build the SequenceView-ready per-proteoform sequence frame. Source: the per-proteoform ``sequence_data`` store (keyed by - ``proteoform_index``, carrying the per-residue ``coverage`` / ``maxCoverage`` - of the DISPLAYED proteoform substring and the full-protein ``sequence`` list + - ``proteoform_start``/``proteoform_end``). We reconstruct the displayed - proteoform sequence STRING (the substring the legacy SequenceView rendered) - and attach coverage so OpenMS-Insight SequenceView can shade residues. - - Columns emitted: ``proteoform_index`` (filter key), ``sequence`` (str), - ``precursor_charge`` (=1, neutral/deconvolved peaks), ``coverage`` (list[f64]), - ``maxCoverage`` (f64), ``fixed_modifications`` (list[str]). + ``proteoform_index``). It carries the FULL protein ``sequence`` list, the + matching full-length per-residue ``coverage`` / ``maxCoverage``, the + determined-region bounds ``proteoform_start`` / ``proteoform_end`` (with the + ``-2`` sentinel = that terminus undetermined / open), the observed + ``computed_mass`` and the ambiguous ``modifications`` ranges. + + We emit the FULL protein sequence (NOT pre-sliced) so the OpenMS-Insight + SequenceView Vue renders the truncated N-/C-flanks and undetermined termini + itself from ``proteoform_start`` / ``proteoform_end``; the full-length + ``coverage`` stays aligned to the full sequence. 
Because the full protein is + rendered, the residue grid index IS the protein-absolute 0-based position, so + ``sequence_offset`` is always 0 (the residue-click cross-link then emits + positions that already match tag ``StartPos`` / ``EndPos`` directly). + + Columns emitted: ``proteoform_index`` (filter key), ``sequence`` (str, FULL + protein), ``precursor_charge`` (=1, neutral/deconvolved peaks), ``coverage`` + (full-length list[f64]), ``maxCoverage`` (f64), ``fixed_modifications`` + (list[str]), ``sequence_offset`` (=0), ``proteoform_start`` / ``proteoform_end`` + (int, sentinel ``-2`` carried through unchanged), ``computed_mass`` (f64), + ``precursor_mass`` (f64, from protein_dfs), and ``mod_ranges`` + (list[struct{start,end,mass_diff,labels}]). ``sequence_data`` is loaded with ``use_polars=True`` and arrives in EITHER of - two formats which are handled identically here: - - * **parquet (current ``parseTnT``):** ``build_table(sequence_data)`` -> - ``parquet_sink`` writes one row per proteoform (schema in - ``src/render/sequence_data_store.py``). With ``use_polars=True`` the loader - returns a polars ``LazyFrame`` (the proteoform-index column is - ``proteoform_index``; ``sequence`` is the FULL protein ``list[str]`` and - ``coverage`` is the matching full-length ``list[f64]``). + two formats handled identically here: + + * **parquet (current ``parseTnT``):** one row per proteoform (schema in + ``src/render/sequence_data_store.py``) returned as a polars ``LazyFrame``. * **pickle dict (legacy ``.pkl.gz`` example caches):** a dict keyed by the - proteoform index, each value a dict carrying the same ``sequence`` / - ``coverage`` / ``maxCoverage`` / ``proteoform_start`` / ``proteoform_end`` / - ``fixed_modifications`` keys. ``use_polars=True`` leaves the unpickled dict - untouched. 
- - In both formats ``sequence`` and ``coverage`` are full-length protein lists; we - slice the displayed proteoform substring AND its coverage together with the - 0-based ``proteoform_start`` / ``proteoform_end`` bounds (a negative/absent - bound means render the full protein).""" + proteoform index; each value a dict with the same keys.""" if not file_manager.result_exists(experiment_id, "sequence_data"): return None store = file_manager.get_results( @@ -366,45 +494,51 @@ def _build_sequence_frame( else: return None + precursor_masses = _precursor_mass_lookup(file_manager, experiment_id) + proteoform_indices: List[int] = [] sequences: List[str] = [] coverages: List[list] = [] max_coverages: List[float] = [] fixed_mods: List[list] = [] sequence_offsets: List[int] = [] + proteoform_starts: List[int] = [] + proteoform_ends: List[int] = [] + computed_masses: List[float] = [] + precursor_mass_col: List[float] = [] + mod_ranges_col: List[list] = [] for entry in rows: pid = entry.get("proteoform_index") if pid is None: continue + # Emit the FULL protein sequence + full-length coverage (no slicing): the + # Vue side derives truncation / undetermined termini from + # proteoform_start/end, so the residue grid index already equals the + # protein-absolute position (sequence_offset = 0). full = list(entry.get("sequence") or []) cov = list(entry.get("coverage") or []) start = entry.get("proteoform_start") end = entry.get("proteoform_end") - # Slice the displayed proteoform substring AND its coverage together so the - # two stay aligned (the legacy SequenceView rendered the substring). A - # negative/absent bound means render the full protein. 
- if start is None or end is None or start < 0 or end < 0: - sub_seq, sub_cov = full, cov - offset = 0 - else: - sub_seq, sub_cov = full[start:end + 1], cov[start:end + 1] - offset = int(start) - # Coordinate decision (a): the displayed sequence is the proteoform - # substring starting at protein position `start`, so grid index i maps to - # protein-absolute position `start + i`. We carry `offset` (clamped to >= 0, - # mirroring the legacy AminoAcidCell.start getter) so the residue-click - # cross-link emits protein-absolute positions matching tag StartPos/EndPos. - # For the bundled example (start=0, end=-2 => full protein) offset is 0, so - # the emitted position equals the grid index exactly as before — the example - # render and coordinates are unchanged. proteoform_indices.append(int(pid)) - sequences.append("".join(str(a) for a in sub_seq)) - coverages.append([float(c) for c in sub_cov]) + sequences.append("".join(str(a) for a in full)) + coverages.append([float(c) for c in cov]) mc = entry.get("maxCoverage") max_coverages.append(float(mc) if mc is not None else 0.0) fm = entry.get("fixed_modifications") or [] fixed_mods.append([str(m) for m in fm]) - sequence_offsets.append(max(offset, 0)) + # Full protein rendered => residue grid index IS protein-absolute position. + sequence_offsets.append(0) + # Carry the determined-region bounds through UNCHANGED, including the + # `-2` (UNDETERMINED_TERMINUS) sentinel. Absent bound => 0 / last residue + # default on the Vue side (no truncation); we default to 0 / len-1 here. 
+ proteoform_starts.append(int(start) if start is not None else 0) + proteoform_ends.append( + int(end) if end is not None else (len(full) - 1 if full else 0) + ) + cm = entry.get("computed_mass") + computed_masses.append(float(cm) if cm is not None else -1.0) + precursor_mass_col.append(float(precursor_masses.get(int(pid), 0.0))) + mod_ranges_col.append(_normalize_mod_ranges(entry.get("modifications"))) if not proteoform_indices: return None @@ -417,6 +551,11 @@ def _build_sequence_frame( "maxCoverage": max_coverages, "fixed_modifications": fixed_mods, "sequence_offset": sequence_offsets, + "proteoform_start": proteoform_starts, + "proteoform_end": proteoform_ends, + "computed_mass": computed_masses, + "precursor_mass": precursor_mass_col, + "mod_ranges": mod_ranges_col, }) return out.lazy() @@ -451,13 +590,26 @@ def _build_sequence_view(file_manager, experiment_id: str, cache_dir: str): sequence_data=seq_frame, peaks_data=peaks, filters={PROTEIN_KEY: "proteoform_index"}, - # Two click sources: a fragment-table row click sets MASS_KEY to the - # matched peak's peak_id (combined-spectrum cross-link), and a SequenceView - # RESIDUE click sets AA_KEY to the clicked residue's protein-absolute - # position via the "residue_position" sentinel (Tag-Table range filter). - interactivity={MASS_KEY: "peak_id", AA_KEY: "residue_position"}, + # Click / span sources (all routed through the interactivity mapping): + # - MASS_KEY: a fragment-table row click sets it to the matched peak's + # peak_id (combined-spectrum cross-link). + # - AA_KEY: a RESIDUE click sets it to the clicked residue's protein- + # absolute position via the "residue_position" sentinel (Tag-Table + # range filter). Now that the full protein is rendered, the grid index + # already IS the protein-absolute position (sequence_offset == 0). 
+ # - TAG_SPAN_KEY: the "tag_span" sentinel does NOT set state on click; + # Vue READS this selection value ({start,end,nTerminal}, protein- + # absolute) to bracket-highlight the selected tag's span on the + # sequence. Published by _resolve_tag_masses. + interactivity={ + MASS_KEY: "peak_id", + AA_KEY: "residue_position", + TAG_SPAN_KEY: "tag_span", + }, deconvolved=True, compute_fixed_mods=True, + # TnT path: keep the variable/custom-mod context menu disabled (default). + disable_variable_modifications=True, settings=settings, title="Sequence View", cache_path=cache_dir, @@ -495,6 +647,12 @@ def _build_combined_spectrum(file_manager, experiment_id: str, cache_dir: str): x_column="MonoMass", y_column="SumIntensity", signal_peak_column="is_signal", + # Charge-state drill-down: per deconv-peak row, the list of constituent + # signal-peak m/z / charge / intensity (present on combined_spectrum_long + # from a fresh parse; empty lists for non-signal peaks). + signal_mz_column="signal_mzs", + signal_charge_column="signal_charges", + signal_intensity_column="signal_intensities", x2_column=x2, y2_column=y2, tag_filters={TAG_MASSES_KEY: "MonoMass"}, @@ -561,14 +719,31 @@ def _build_heatmap( } -def build_component(file_manager, experiment_id: str, cache_dir: str, comp_name: str): +def build_component( + file_manager, experiment_id: str, cache_dir: str, comp_name: str, + best_per_spectrum: bool = True, +): """Instantiate the OpenMS-Insight component for a layout cell, or None.""" builder = COMPONENT_BUILDERS.get(comp_name) if builder is None: return None + if comp_name == "protein_table": + return _build_protein_table( + file_manager, experiment_id, cache_dir, + best_per_spectrum=best_per_spectrum, + ) return builder(file_manager, experiment_id, cache_dir) +def _clear_proteoform_dependent_selections(state_manager) -> None: + """Clear the per-proteoform downstream selections (mirrors legacy + TabulatorProteinTable.vue:235-237, which resets the selected tag / tag data / + 
selected AA on a proteoform change). We also clear the resolved tag masses, + tag span and selected mass so no stale tag/peak highlight survives the switch.""" + for ident in (TAG_KEY, TAG_MASSES_KEY, AA_KEY, TAG_SPAN_KEY, MASS_KEY): + state_manager.clear_selection(ident) + + def render_experiment_panel( experiment_id: str, layout_info_per_exp: List[List[str]], @@ -584,6 +759,32 @@ def render_experiment_panel( state_manager = StateManager(session_key=session_key) cache_dir = _component_cache_dir(file_manager, experiment_id) + # Selection clearing on proteoform change (legacy + # TabulatorProteinTable.vue:235-237): when the selected proteinIndex differs + # from the last-seen one for THIS panel, clear the downstream per-proteoform + # selections (tag / tag masses / selected AA / tag span / selected mass) BEFORE + # building components so no stale selection leaks across proteoforms. + last_seen_key = f"{session_key}__last_protein_index" + current_protein = state_manager.get_selection(PROTEIN_KEY) + if st.session_state.get(last_seen_key, "__unset__") != current_protein: + _clear_proteoform_dependent_selections(state_manager) + st.session_state[last_seen_key] = current_protein + + # Best-per-spectrum toggle (legacy default ON). Per-panel widget key so + # side-by-side panels toggle independently. Shown only when a protein table is + # in the layout. + has_protein_table = any( + "protein_table" in row for row in layout_info_per_exp + ) + best_per_spectrum = True + if has_protein_table: + best_per_spectrum = st.checkbox( + "Best per spectrum", + value=True, + key=f"tnt_oi_best_per_spectrum_{panel_index}", + help="Show only the highest-scoring proteoform per spectrum (scan).", + ) + # Resolve the selected tag (scalar TagIndex set by the Tag Table) into its # list of masses BEFORE rendering so the combined-spectrum tagger overlay # sees the up-to-date `tagMasses` selection this rerun. 
@@ -594,7 +795,8 @@ def render_experiment_panel( for col, (col_index, comp_name) in zip(columns, enumerate(row)): with col: component = build_component( - file_manager, experiment_id, cache_dir, comp_name + file_manager, experiment_id, cache_dir, comp_name, + best_per_spectrum=best_per_spectrum, ) if component is None: st.warning(f"No data for '{comp_name}'.") diff --git a/src/parse/deconv.py b/src/parse/deconv.py index 922a25d8..f872d0c9 100644 --- a/src/parse/deconv.py +++ b/src/parse/deconv.py @@ -89,19 +89,88 @@ def _explode_long_by_position(indexed_lf, id_col, value_exprs): return lf.select(["index", id_col] + out_names) +# Each per-mass entry of the nested ``SignalPeaks`` column is a list of matched +# signal peaks, and each signal peak is a 4-tuple ``[peak_index, mz, intensity, +# charge]`` (verified against the example caches under example-data/workspaces/**: +# ``SignalPeaks`` has dtype ``List(List(List(Float64)))`` — scan → mass → peak → +# quadruple). The charge drill-down ("Augmented Annotated Spectrum") needs, for +# each deconvolved mass, the per-mass mz / charge / intensity of its signal peaks +# (not just the ``is_signal`` boolean). These expressions pull those three axes +# out of the nested column WITHOUT flattening across masses: each evaluates to a +# per-scan ``List(List(...))`` whose outer position is the mass axis (aligned to +# ``MonoMass`` / ``peak_id``) and whose inner list is that one mass's small signal +# arrays. _explode_long_by_position then gathers the inner list by mass position, +# so each output row carries exactly the signal peaks belonging to that peak. +def _signal_mzs_expr(): + # mz is element index 1 of each [idx, mz, intensity, charge] signal peak. + return pl.col("SignalPeaks").list.eval( + pl.element().list.eval(pl.element().list.get(1)) + ) + + +def _signal_intensities_expr(): + # intensity is element index 2 of each signal-peak quadruple. 
+ return pl.col("SignalPeaks").list.eval( + pl.element().list.eval(pl.element().list.get(2)) + ) + + +def _signal_charges_expr(): + # charge is element index 3; stored as float in the nested array, cast to i64. + return pl.col("SignalPeaks").list.eval( + pl.element().list.eval(pl.element().list.get(3).cast(pl.Int64)) + ) + + +def _signal_is_signal_expr(): + # Per-mass boolean: True where that mass has >=1 matched signal peak. + return pl.col("SignalPeaks").list.eval(pl.element().list.len() > 0) + + +def _fill_empty_signal_lists(lf): + """Replace ``null`` signal/charge/intensity cells with empty lists. + + Positions past the per-mass ``SignalPeaks`` axis (e.g. a ragged scan whose + full-spectrum ``mz_array`` is longer than ``SignalPeaks``) explode to ``null`` + list cells. Coerce them to empty lists so consumers always read a list (parity + with ``is_signal`` being filled to ``False``), and every signal-flagged peak + has equal-length ``signal_mzs`` / ``signal_charges`` / ``signal_intensities``. + """ + return lf.with_columns([ + pl.col("signal_mzs").fill_null([]).alias("signal_mzs"), + pl.col("signal_charges").fill_null([]).alias("signal_charges"), + pl.col("signal_intensities").fill_null([]).alias("signal_intensities"), + ]) + + def deconv_spectrum_long(pl_deconv_indexed): - """One row per deconvolved peak: index, peak_id, MonoMass, SumIntensity. + """One row per deconvolved peak with the per-mass signal-peak arrays. + + Columns: index, peak_id, MonoMass, SumIntensity, signal_mzs (list[f64]), + signal_charges (list[i64]), signal_intensities (list[f64]). Long-format replacement for the array-valued ``deconv_spectrum`` frame, consumed by ``LinePlot(filters={'scanIndex':'index'}, x_column='MonoMass', y_column='SumIntensity')``. + + The three ``signal_*`` list columns carry, for the deconvolved mass at this + ``peak_id`` position, the mz / charge / intensity of each of its matched + signal peaks (the per-mass ``SignalPeaks[peak_id]`` axis). 
They are aligned to + each other (same length, one entry per signal peak of this mass) and to the + ``peak_id``/``MonoMass`` row, and back the "Augmented Annotated Spectrum" + charge drill-down. A peak with no matched signal (or a ragged past-end + position) carries empty lists. """ - return _explode_long_by_position( + lf = _explode_long_by_position( pl_deconv_indexed, "peak_id", [("MonoMass", pl.col("mz_array")), - ("SumIntensity", pl.col("intensity_array"))], + ("SumIntensity", pl.col("intensity_array")), + ("signal_mzs", _signal_mzs_expr()), + ("signal_charges", _signal_charges_expr()), + ("signal_intensities", _signal_intensities_expr())], ) + return _fill_empty_signal_lists(lf) def anno_spectrum_long(pl_anno_indexed): @@ -121,9 +190,12 @@ def anno_spectrum_long(pl_anno_indexed): def combined_spectrum_long(pl_deconv_indexed): - """One row per deconvolved peak with a signal-membership flag. + """One row per deconvolved peak with a signal-membership flag and the + per-mass signal-peak arrays. - Columns: index, peak_id, MonoMass, SumIntensity, is_signal (bool). + Columns: index, peak_id, MonoMass, SumIntensity, is_signal (bool), + signal_mzs (list[f64]), signal_charges (list[i64]), + signal_intensities (list[f64]). ``is_signal`` is True when the corresponding per-mass entry of the nested ``SignalPeaks`` column is non-empty, i.e. the deconvolved mass at that @@ -132,23 +204,36 @@ def combined_spectrum_long(pl_deconv_indexed): the same position). ``SignalPeaks`` is the per-mass axis and in real output can be SHORTER than ``mz_array``; positions beyond its length therefore have no signal entry and are flagged ``False`` (parity with the JS ``undefined`` - → no-signal). This is the long-format counterpart of the array-valued - ``combined_spectrum`` deconv side; the annotated overlay is provided - separately by ``anno_spectrum_long`` (the OpenMS-Insight LinePlot reads the - 2nd series from its own ``x2_column``/``y2_column`` frame). + → no-signal). 
+ + ``signal_mzs`` / ``signal_charges`` / ``signal_intensities`` carry, for the + mass at this ``peak_id`` position, the mz / charge / intensity of each matched + signal peak (the contents of ``SignalPeaks[peak_id]``). The three lists are + mutually aligned (one entry per signal peak of this mass, equal length) and + aligned to the ``peak_id``/``MonoMass`` row; they back the legacy "Augmented + Annotated Spectrum" charge drill-down. When ``is_signal`` is False (no matched + signal, or a ragged past-end position) all three lists are empty. + + This is the long-format counterpart of the array-valued ``combined_spectrum`` + deconv side; the annotated overlay is provided separately by + ``anno_spectrum_long`` (the OpenMS-Insight LinePlot reads the 2nd series from + its own ``x2_column``/``y2_column`` frame). """ - # Per-mass boolean list: True where that mass has >=1 signal peak. Aligned to - # the SignalPeaks (per-mass) axis; _explode_long_by_position gathers it by the - # same position id as MonoMass and yields null past its end, coerced to False. - is_signal_list = pl.col("SignalPeaks").list.eval(pl.element().list.len() > 0) + # Per-mass list columns, all aligned to the SignalPeaks (per-mass) axis. + # _explode_long_by_position gathers each by the same position id as MonoMass + # and yields null past its end (coerced below to False / empty lists). 
lf = _explode_long_by_position( pl_deconv_indexed, "peak_id", [("MonoMass", pl.col("mz_array")), ("SumIntensity", pl.col("intensity_array")), - ("is_signal", is_signal_list)], + ("is_signal", _signal_is_signal_expr()), + ("signal_mzs", _signal_mzs_expr()), + ("signal_charges", _signal_charges_expr()), + ("signal_intensities", _signal_intensities_expr())], ) - return lf.with_columns(pl.col("is_signal").fill_null(False)) + lf = lf.with_columns(pl.col("is_signal").fill_null(False)) + return _fill_empty_signal_lists(lf) def mass_table_long(pl_deconv_indexed): @@ -185,6 +270,37 @@ def mass_table_long(pl_deconv_indexed): ] return _explode_long_by_position(pl_deconv_indexed, "mass_id", value_exprs) +def threedim_SN_plot(pl_deconv_indexed): + """3D signal/noise scatter frame with precursor-lookup keys. + + Columns: index, Scan (i64), PrecursorScan (f64), PrecursorMass (f64), + MonoMass (list[f64], == ``mz_array``), SignalPeaks, NoisyPeaks. + + Carries the precursor-lookup keys the OpenMS-Insight Scatter3D needs to match + a fragment scan's precursor back to the mass that generated it: each (MS2) scan + row has its own ``Scan`` id, the ``PrecursorScan`` it was isolated from, the + scalar ``PrecursorMass``, and the per-mass ``MonoMass`` array of that scan. The + viewer locates the precursor scan's row (``Scan == PrecursorScan``) and finds + the index into that scan's ``MonoMass`` array whose value matches + ``PrecursorMass``, using the same per-mass position (``SignalPeaks[massIndex]``) + as the rest of the 3D plot. ``Scan`` / ``PrecursorScan`` are the join keys, + ``PrecursorMass`` is the scalar to match, ``MonoMass`` is the per-mass axis to + search. 
+ """ + return ( + pl_deconv_indexed + .select([ + pl.col('index'), + pl.col('Scan'), + pl.col('PrecursorScan'), + pl.col('PrecursorMass'), + pl.col('mz_array').alias('MonoMass'), + pl.col('SignalPeaks'), + pl.col('NoisyPeaks') + ]) + ) + + def parseDeconv( file_manager, dataset_id, out_deconv_mzML, anno_annotated_mzML, spec1_tsv=None, spec2_tsv=None, logger=None @@ -391,16 +507,8 @@ def parseDeconv( logger.log("80.0 %", level=2) - # 3D_SN_plot - using native polars LazyFrame operations - threedim_SN_plot_lazy = ( - pl_deconv_indexed - .select([ - pl.col('index'), - pl.col('PrecursorScan'), - pl.col('SignalPeaks'), - pl.col('NoisyPeaks') - ]) - ) + # 3D_SN_plot - precursor-lookup keys + signal/noise peaks (see threedim_SN_plot). + threedim_SN_plot_lazy = threedim_SN_plot(pl_deconv_indexed) file_manager.store_data(dataset_id, 'threedim_SN_plot', threedim_SN_plot_lazy) logger.log("90.0 %", level=2) diff --git a/tests/test_deconv_long_format.py b/tests/test_deconv_long_format.py index 25b00355..96d2add4 100644 --- a/tests/test_deconv_long_format.py +++ b/tests/test_deconv_long_format.py @@ -42,6 +42,7 @@ combined_spectrum_long, deconv_spectrum_long, mass_table_long, + threedim_SN_plot, ) @@ -86,7 +87,10 @@ def _max_len_expansion(row, cols): def test_deconv_spectrum_long_schema_and_rowcount(): df = deconv_spectrum_long(_deconv().lazy()).collect() - assert df.columns == ["index", "peak_id", "MonoMass", "SumIntensity"] + assert df.columns == [ + "index", "peak_id", "MonoMass", "SumIntensity", + "signal_mzs", "signal_charges", "signal_intensities", + ] # 4 + 0 + 2 + 1 = 7 assert df.height == 7 @@ -135,7 +139,10 @@ def test_mass_table_long_rowcount_and_empty_scan(): def test_combined_spectrum_long_is_signal(): deconv = _deconv() df = combined_spectrum_long(deconv.lazy()).collect() - assert df.columns == ["index", "peak_id", "MonoMass", "SumIntensity", "is_signal"] + assert df.columns == [ + "index", "peak_id", "MonoMass", "SumIntensity", "is_signal", + "signal_mzs", 
"signal_charges", "signal_intensities", + ] dpd = deconv.to_pandas() for r in df.iter_rows(named=True): sp = dpd[dpd["index"] == r["index"]].iloc[0]["SignalPeaks"] @@ -147,6 +154,124 @@ def test_combined_spectrum_long_is_signal(): assert row3["is_signal"] is False +# Signal-peak quadruple layout in the nested SignalPeaks column: +# [peak_index, mz, intensity, charge]. (Verified against the example caches: +# SignalPeaks has dtype List(List(List(Float64))) — scan -> mass -> peak -> tuple.) +_SP_MZ, _SP_INT, _SP_CH = 1, 2, 3 + + +def _check_signal_arrays(df): + """Shared assertions for the per-mass signal_* list columns on a long frame.""" + # Column dtypes: lists of f64 / i64 / f64. + assert df.schema["signal_mzs"] == pl.List(pl.Float64) + assert df.schema["signal_intensities"] == pl.List(pl.Float64) + assert df.schema["signal_charges"] == pl.List(pl.Int64) + + deconv = _deconv() + dpd = deconv.to_pandas() + for r in df.iter_rows(named=True): + sp = dpd[dpd["index"] == r["index"]].iloc[0]["SignalPeaks"] + pid = r["peak_id"] + peaks = list(sp[pid]) if pid < len(sp) else [] + + mzs = r["signal_mzs"] + chs = r["signal_charges"] + ints = r["signal_intensities"] + + # Never null — past-end / no-signal positions are empty lists. + assert mzs is not None and chs is not None and ints is not None + # The three arrays are mutually aligned (one entry per signal peak). + assert len(mzs) == len(chs) == len(ints) == len(peaks) + # Contents match the per-mass signal peaks at this position. + assert mzs == [p[_SP_MZ] for p in peaks] + assert ints == [p[_SP_INT] for p in peaks] + assert chs == [int(p[_SP_CH]) for p in peaks] + + # Alignment with is_signal (combined frame only) / non-emptiness. + if "is_signal" in r: + assert bool(r["is_signal"]) == (len(mzs) > 0) + + +def test_deconv_spectrum_long_signal_arrays(): + df = deconv_spectrum_long(_deconv().lazy()).collect() + _check_signal_arrays(df) + # Concrete spot check: scan 0, peak 0 has one signal peak (mz 1000.1, ch 1). 
+ r0 = df.filter((pl.col("index") == 0) & (pl.col("peak_id") == 0)).row(0, named=True) + assert r0["signal_mzs"] == [1000.1] + assert r0["signal_charges"] == [1] + assert r0["signal_intensities"] == [10.0] + # Non-signal peak (scan 0, peak 1) and ragged past-end peak (scan 0, peak 3) + # both carry empty lists. + for pid in (1, 3): + rr = df.filter((pl.col("index") == 0) & (pl.col("peak_id") == pid)).row(0, named=True) + assert rr["signal_mzs"] == [] + assert rr["signal_charges"] == [] + assert rr["signal_intensities"] == [] + + +def test_combined_spectrum_long_signal_arrays(): + df = combined_spectrum_long(_deconv().lazy()).collect() + _check_signal_arrays(df) + # Signal-flagged peaks have non-empty, equal-length signal_* lists; non-signal + # peaks have empty lists across all three. + for r in df.iter_rows(named=True): + if r["is_signal"]: + assert len(r["signal_mzs"]) > 0 + assert len(r["signal_mzs"]) == len(r["signal_charges"]) == len(r["signal_intensities"]) + else: + assert r["signal_mzs"] == [] + assert r["signal_charges"] == [] + assert r["signal_intensities"] == [] + + +def _deconv_3d(): + # MS1 precursor scan (scan 100) with two masses; MS2 fragment scan (scan 101) + # isolated from precursor mass 2000.2 in scan 100. + return pl.DataFrame( + { + "Scan": [100, 101], + "PrecursorScan": [0.0, 100.0], + "PrecursorMass": [0.0, 2000.2], + "mz_array": [[1000.1, 2000.2], [3000.3]], + "intensity_array": [[10.0, 20.0], [30.0]], + "SignalPeaks": [ + [[[0.0, 1000.1, 10.0, 1.0]], [[1.0, 2000.2, 20.0, 2.0]]], + [[[0.0, 3000.3, 30.0, 1.0]]], + ], + "NoisyPeaks": [ + [[], []], + [[]], + ], + } + ).with_row_index("index") + + +def test_threedim_SN_plot_precursor_lookup_columns(): + df = threedim_SN_plot(_deconv_3d().lazy()).collect() + assert df.columns == [ + "index", "Scan", "PrecursorScan", "PrecursorMass", + "MonoMass", "SignalPeaks", "NoisyPeaks", + ] + # Precursor-lookup key dtypes. 
+ assert df.schema["Scan"] == pl.Int64 + assert df.schema["PrecursorScan"] == pl.Float64 + assert df.schema["PrecursorMass"] == pl.Float64 + # MonoMass is the per-mass array (== mz_array). + assert df.schema["MonoMass"] == pl.List(pl.Float64) + + # The MS2 fragment scan's precursor resolves to a mass in its precursor scan: + # find the precursor-scan row (Scan == PrecursorScan) and the MonoMass index + # matching PrecursorMass — the position the Scatter3D uses for SignalPeaks. + ms2 = df.filter(pl.col("Scan") == 101).row(0, named=True) + assert ms2["PrecursorScan"] == 100.0 + assert ms2["PrecursorMass"] == 2000.2 + prec = df.filter(pl.col("Scan") == int(ms2["PrecursorScan"])).row(0, named=True) + mass_index = prec["MonoMass"].index(ms2["PrecursorMass"]) + assert mass_index == 1 + # That per-mass position carries the matching signal peaks in the precursor scan. + assert len(prec["SignalPeaks"][mass_index]) > 0 + + def test_peak_id_and_mass_id_share_mass_axis(): deconv = _deconv() ds = deconv_spectrum_long(deconv.lazy()).collect() From ed7a504ee75244de51933acddfb208cc09f5b624 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 2 Jun 2026 13:14:46 +0000 Subject: [PATCH 09/10] Round-2 viewer fixes: raw formatters, selectedAA, Scatter3D guard, Precursor Re-audit follow-ups: - Tables: drop precision from the dashNegativeOne columns (ProteoformMass, ProteoformLevelQvalue, Nmass, Cmass) so they show the RAW value like legacy (precision-4 was destroying tiny Q-values); keep the -1 -> "-" sentinel. - TnT tag walk: publish selectedAA (= selectedAApos - StartPos when the selected residue is within the tag) so the selected-residue gold highlight renders; drop zero-valued tag masses (legacy filters != 0); silently skip absent components instead of st.warning (match Deconv). 
- Deconv Scatter3D: pass the precursor-lookup columns only when all four are present in the frame, so stale 4-column threedim_SN_plot caches no longer crash the panel (ValueError) and degrade to the legacy per-scan view. - Deconv SequenceView: stop emitting computed_mass (it forced the TnT branch, mislabeled the header "Proteoform", and disabled the variable-mod menu); emit the observed precursor_mass from the selected scan so the "Precursor" header renders and variable mods stay enabled. - Remove dead code (_data_path, unused imports) and drop the inert internal_fragment_map from the Deconv selectable layout (parity with TnT). --- .../FLASHDeconv/FLASHDeconvLayoutManager.py | 8 +- content/FLASHDeconv/FLASHDeconvViewerOI.py | 113 +++++++++++++----- content/FLASHTnT/FLASHTnTViewerOI.py | 90 +++++++++----- 3 files changed, 149 insertions(+), 62 deletions(-) diff --git a/content/FLASHDeconv/FLASHDeconvLayoutManager.py b/content/FLASHDeconv/FLASHDeconvLayoutManager.py index a2094d2b..0b1d1cc8 100644 --- a/content/FLASHDeconv/FLASHDeconvLayoutManager.py +++ b/content/FLASHDeconv/FLASHDeconvLayoutManager.py @@ -219,11 +219,13 @@ def handleSettingButtons(): def setSequenceView(): if get_sequence() is not None: + # Parity with the TnT layout: `internal_fragment_map` was dropped because + # neither the legacy grid nor the OI viewer renders it (it produces + # nothing). Only the sequence view is added on sequence submission. 
global COMPONENT_OPTIONS - COMPONENT_OPTIONS = COMPONENT_OPTIONS + ['Sequence view (Mass table needed)', - 'Internal fragment map (Mass table needed)'] + COMPONENT_OPTIONS = COMPONENT_OPTIONS + ['Sequence view (Mass table needed)'] global COMPONENT_NAMES - COMPONENT_NAMES = COMPONENT_NAMES + ['sequence_view', 'internal_fragment_map'] + COMPONENT_NAMES = COMPONENT_NAMES + ['sequence_view'] # page initialization diff --git a/content/FLASHDeconv/FLASHDeconvViewerOI.py b/content/FLASHDeconv/FLASHDeconvViewerOI.py index df7eb9d4..a9c0ecc1 100644 --- a/content/FLASHDeconv/FLASHDeconvViewerOI.py +++ b/content/FLASHDeconv/FLASHDeconvViewerOI.py @@ -30,7 +30,7 @@ from __future__ import annotations from pathlib import Path -from typing import Any, Dict, List, Optional +from typing import List, Optional import polars as pl import streamlit as st @@ -45,10 +45,7 @@ Table, ) -from content.FLASHDeconv.deconv_sequence import ( - bake_fixed_modifications, - theoretical_mass, -) +from content.FLASHDeconv.deconv_sequence import bake_fixed_modifications # Map the layout COMPONENT_NAMES (FLASHDeconvLayoutManager) to a builder. 
Every # builder returns a *callable* OpenMS-Insight component already wired with the @@ -109,15 +106,6 @@ def _component_cache_dir(file_manager, experiment_id: str) -> str: return str(cache_root) -def _data_path(file_manager, experiment_id: str, name_tag: str) -> Optional[str]: - """Resolve the on-disk parquet path for a stored frame, or None if absent.""" - if not file_manager.result_exists(experiment_id, name_tag): - return None - res = file_manager.get_results(experiment_id, [name_tag], partial=True) - path = res.get(name_tag) - return str(path) if path is not None else None - - def _lazy(file_manager, experiment_id: str, name_tag: str) -> Optional[pl.LazyFrame]: """Load a stored frame as a polars LazyFrame, or None if absent.""" if not file_manager.result_exists(experiment_id, name_tag): @@ -281,6 +269,25 @@ def _build_scatter3d(file_manager, experiment_id: str, cache_dir: str): data = _lazy(file_manager, experiment_id, "threedim_SN_plot") if data is None: return None + # MS2 precursor-signal lookup: locate the precursor scan's row + # (Scan == PrecursorScan) and the index into its MonoMass array whose value + # matches PrecursorMass. Fresh parses (src/parse/deconv.py) emit all four + # columns (7-col frame), but STALE/OLD ``threedim_SN_plot.pq`` caches only + # carry the 4 legacy columns (index, PrecursorScan, SignalPeaks, NoisyPeaks). + # Scatter3D._validate_mappings raises ValueError if any precursor column is + # configured-but-missing, and this builder runs OUTSIDE the page try/except, + # so we MUST schema-gate: pass the precursor params ONLY when ALL FOUR + # columns are present; otherwise fall back to the legacy per-scan behavior. 
+ schema_names = data.collect_schema().names() + precursor_cols = ("Scan", "PrecursorScan", "PrecursorMass", "MonoMass") + precursor_kwargs = {} + if all(col in schema_names for col in precursor_cols): + precursor_kwargs = { + "scan_column": "Scan", + "precursor_scan_column": "PrecursorScan", + "precursor_mass_column": "PrecursorMass", + "mono_mass_column": "MonoMass", + } # 3D S/N plot: scanIndex value-filters on `index`; massIndex handled internally # as an array subscript (NOT a value filter). return Scatter3D( @@ -289,16 +296,9 @@ def _build_scatter3d(file_manager, experiment_id: str, cache_dir: str): scan_filter="index", signal_column="SignalPeaks", noisy_column="NoisyPeaks", - # MS2 precursor-signal lookup: locate the precursor scan's row - # (Scan == PrecursorScan) and the index into its MonoMass array whose - # value matches PrecursorMass. All four columns are emitted on - # threedim_SN_plot by src/parse/deconv.py. - scan_column="Scan", - precursor_scan_column="PrecursorScan", - precursor_mass_column="PrecursorMass", - mono_mass_column="MonoMass", title="Precursor Signals", cache_path=cache_dir, + **precursor_kwargs, ) @@ -318,6 +318,49 @@ def _build_fdr_plot(file_manager, experiment_id: str, cache_dir: str): ) +def _selected_precursor_mass(file_manager, experiment_id: str, state_manager): + """Observed PrecursorMass of the currently-selected scan, or None. + + Mirrors the legacy "Precursor" mass header (src/render/update.py get_sequence + / per-scan data), which reads ``PrecursorMass`` from the selected scan. The + selected scan is the ``scanIndex`` selection (== the scan_table ``index``); + we look its ``PrecursorMass`` up in the scan_table frame. Returns None when no + scan is selected, the table/column is absent, or the value is 0.0 (the legacy + sentinel for "scan not eligible for this view", which renders an empty header). 
+ """ + if state_manager is None: + return None + selected_index = state_manager.get_selection(SCAN_KEY) + if selected_index is None: + return None + scan_table = _lazy(file_manager, experiment_id, "scan_table") + if scan_table is None: + return None + names = scan_table.collect_schema().names() + if "index" not in names or "PrecursorMass" not in names: + return None + try: + row = ( + scan_table.filter(pl.col("index") == selected_index) + .select("PrecursorMass") + .collect() + ) + except Exception: + return None + if row.height == 0: + return None + value = row["PrecursorMass"][0] + if value is None: + return None + try: + value = float(value) + except (TypeError, ValueError): + return None + if value == 0.0: + return None + return value + + def _get_sequence(file_manager): """Return the submitted (sequence, fix_C, fix_M) tuple, or None.""" if not file_manager.result_exists("sequence", "sequence"): @@ -374,17 +417,29 @@ def _build_sequence_view( pl.col("SumIntensity").alias("intensity"), ) - # Pass the sequence as a single-row frame so we can attach the optional - # `computed_mass` column (the baked sequence's monoisotopic mass) for the - # SequenceView mass header. Falls back to a plain string when pyOpenMS is - # unavailable (theoretical_mass returns None) so the column is simply omitted. - seq_mass = theoretical_mass(sequence_string) - if seq_mass is not None: + # Mass header parity (legacy Deconv shows the "Precursor" header: + # Theoretical / Observed / Δ). We MUST NOT emit `computed_mass` here: in the + # OI SequenceView Vue, `displayTnT = (computedMass !== undefined)`, which would + # (a) mislabel the header "Proteoform" instead of "Precursor", and (b) force + # `disableVariableModifications = true`, silently disabling the variable/custom + # modification context menu that this path explicitly enables via + # `disable_variable_modifications=False`. So `computed_mass` stays dropped. 
+ # + # Instead emit `precursor_mass` = the OBSERVED precursor mass of the selected + # scan (legacy reads `PrecursorMass` from the selected scan; see + # src/render/update.py get_sequence). When a scan is selected and its + # PrecursorMass is reachable in the scan_table, wire it so the "Precursor" + # header renders; otherwise omit it (header observed/Δ rows render empty) -- + # either way the variable-mod menu stays enabled. + precursor_mass = _selected_precursor_mass( + file_manager, experiment_id, state_manager + ) + if precursor_mass is not None: sequence_data = pl.LazyFrame( { "sequence": [sequence_string], "precursor_charge": [1], - "computed_mass": [seq_mass], + "precursor_mass": [precursor_mass], } ) else: diff --git a/content/FLASHTnT/FLASHTnTViewerOI.py b/content/FLASHTnT/FLASHTnTViewerOI.py index bd224773..dfbe5cbc 100644 --- a/content/FLASHTnT/FLASHTnTViewerOI.py +++ b/content/FLASHTnT/FLASHTnTViewerOI.py @@ -170,20 +170,22 @@ def _stamp_proteoform_index( {"title": "Accession", "field": "accession", "sorter": "string"}, {"title": "Description", "field": "description", "sorter": "string"}, {"title": "Length", "field": "length", "sorter": "number"}, - # Legacy TabulatorProteinTable.vue renders the `-1` sentinel as "-" (the raw - # value otherwise). The OI `dashNegativeOne` formatter reproduces the sentinel - # rule; precision 4 matches the app-wide `toFixedFormatter()` default (4 dp, - # used for every other numeric mass column, e.g. MonoMass in the mass table). + # Legacy TabulatorProteinTable.vue renders the `-1` sentinel as "-" and the + # RAW unrounded value otherwise (TabulatorProteinTable.vue:78-81). The OI + # `dashNegativeOne` formatter reproduces the sentinel rule and renders the raw + # value when `precision` is omitted -- so we drop `formatterParams` to avoid + # rounding (critical for tiny Q-values: 0.00012 must NOT become 0.0001). 
{"title": "Mass", "field": "ProteoformMass", "sorter": "number", - "formatter": "dashNegativeOne", "formatterParams": {"precision": 4}}, + "formatter": "dashNegativeOne"}, {"title": "No. of Matched Fragments", "field": "MatchingFragments", "sorter": "number"}, {"title": "No. of Modifications", "field": "ModCount", "sorter": "number"}, {"title": "No. of Tags", "field": "TagCount", "sorter": "number"}, {"title": "Score", "field": "Score", "sorter": "number"}, - # Q-Value also uses the `-1 -> "-"` sentinel rule (legacy formatter); 4 dp - # matches the app-wide default decimal precision. + # Q-Value also uses the `-1 -> "-"` sentinel rule with the RAW unrounded value + # otherwise (TabulatorProteinTable.vue:105-108). No `formatterParams` so the + # raw value is shown -- rounding would corrupt tiny Q-values (e.g. 0.00012). {"title": "Q-Value (Proteoform Level)", "field": "ProteoformLevelQvalue", "sorter": "number", - "formatter": "dashNegativeOne", "formatterParams": {"precision": 4}}, + "formatter": "dashNegativeOne"}, ] # TabulatorTagTable.vue columns -> tag_dfs fields. @@ -194,12 +196,13 @@ def _stamp_proteoform_index( {"title": "Sequence", "field": "TagSequence", "sorter": "string"}, {"title": "Length", "field": "Length", "sorter": "number"}, {"title": "Tag Score", "field": "Score", "sorter": "number"}, - # N/C mass use the legacy `-1 -> "-"` sentinel rule (TabulatorTagTable.vue - # ~72-83); precision 4 matches the app-wide mass decimal default. + # N/C mass use the legacy `-1 -> "-"` sentinel rule and render the RAW + # unrounded value otherwise (TabulatorTagTable.vue:72-83). No `formatterParams` + # so the raw value is shown (rounding would lose precision on the mass offset). 
{"title": "N mass", "field": "Nmass", "sorter": "number", - "formatter": "dashNegativeOne", "formatterParams": {"precision": 4}}, + "formatter": "dashNegativeOne"}, {"title": "C mass", "field": "Cmass", "sorter": "number", - "formatter": "dashNegativeOne", "formatterParams": {"precision": 4}}, + "formatter": "dashNegativeOne"}, {"title": "Δ mass", "field": "DeltaMass", "sorter": "number"}, ] @@ -310,15 +313,18 @@ def _resolve_tag_masses(file_manager, experiment_id: str, state_manager) -> None Only the selected tag's row is collected (filtered by ``TagIndex``). The tag ``mzs`` are a comma-joined string (trailing comma); parse and drop non-numeric - entries, keeping the STORED order (ascending for C-term tags, descending for - N-term tags). ``TagSequence`` gives the residue letters; the legacy walks - consecutive stored masses labelling gap ``i`` with ``sequence[len-1-i]`` — - i.e. the REVERSED sequence aligns to the stored-order gaps regardless of - anchoring (verified against both an ascending C-term and a descending N-term - tag). Do NOT sort the masses: sorting breaks the alignment for descending - (N-term) tags. The published value is a dict - ``{"masses": [...], "residues": [...]}`` consumed by the OI LinePlot tag walk; - when no residues are available it carries only masses (highlight-only).""" + AND zero entries (legacy ``number !== 0``), keeping the STORED order (ascending + for C-term tags, descending for N-term tags). ``TagSequence`` gives the residue + letters; the legacy walks consecutive stored masses labelling gap ``i`` with + ``sequence[len-1-i]`` — i.e. the REVERSED sequence aligns to the stored-order + gaps regardless of anchoring (verified against both an ascending C-term and a + descending N-term tag). Do NOT sort the masses: sorting breaks the alignment + for descending (N-term) tags. 
The published value is a dict + ``{"masses": [...], "residues": [...], "nTerminal": bool}`` consumed by the OI + LinePlot tag walk; when no residues are available it carries only masses + (highlight-only). When a residue within the selected tag's span is also + selected (``selectedAApos``), a tag-relative ``selectedAA`` index is added so + the walk gold-highlights that residue (legacy ``selectedAApos - StartPos``).""" def _clear_all() -> None: state_manager.clear_selection(TAG_MASSES_KEY) state_manager.clear_selection(TAG_SPAN_KEY) @@ -355,8 +361,12 @@ def _clear_all() -> None: raw = selected["tag_masses"][0] # Keep STORED order (do not sort) so the reversed-sequence walk aligns for - # both ascending (C-term) and descending (N-term) tags. - masses = [m for m in raw if m is not None] if raw is not None else [] + # both ascending (C-term) and descending (N-term) tags. Drop null AND zero + # masses (legacy `number !== 0`, TabulatorTagTable.vue:140): a literal 0 mass + # would misalign the reversed-residue walk. + masses = ( + [m for m in raw if m is not None and m != 0] if raw is not None else [] + ) if not masses: _clear_all() return @@ -373,15 +383,33 @@ def _clear_all() -> None: n_mass = selected["n_mass"][0] n_terminal = (n_mass is not None) and (float(n_mass) == -1.0) - state_manager.set_selection( - TAG_MASSES_KEY, - {"masses": list(masses), "residues": residues, "nTerminal": n_terminal}, - ) + tag_masses = { + "masses": list(masses), + "residues": residues, + "nTerminal": n_terminal, + } - # Tag-span highlight on the SequenceView. StartPos/EndPos are protein-absolute - # (matching the full-protein residue grid), so they bracket the tag directly. + # Selected-residue gold (#F3A712) highlight (legacy + # `selectedTag.selectedAA = selectedAApos - StartPos`, TabulatorTagTable.vue: + # 151,169). 
When a residue is selected (AA_KEY holds its protein-absolute + # position) AND it falls within the selected tag's [StartPos, EndPos] span, + # publish the tag-relative residue index so the LinePlot tag walk highlights + # that residue; omit otherwise (no highlight). start_pos = selected["start_pos"][0] end_pos = selected["end_pos"][0] + selected_aa_pos = state_manager.get_selection(AA_KEY) + if ( + selected_aa_pos is not None + and start_pos is not None + and end_pos is not None + and int(start_pos) <= int(selected_aa_pos) <= int(end_pos) + ): + tag_masses["selectedAA"] = int(int(selected_aa_pos) - int(start_pos)) + + state_manager.set_selection(TAG_MASSES_KEY, tag_masses) + + # Tag-span highlight on the SequenceView. StartPos/EndPos are protein-absolute + # (matching the full-protein residue grid), so they bracket the tag directly. if start_pos is not None and end_pos is not None: state_manager.set_selection( TAG_SPAN_KEY, @@ -799,7 +827,9 @@ def render_experiment_panel( best_per_spectrum=best_per_spectrum, ) if component is None: - st.warning(f"No data for '{comp_name}'.") + # Silently skip an absent component (data frame missing), + # matching the Deconv viewer's documented intent and avoiding + # noisy warnings on stale / partial caches. continue key = f"tnt_oi_{panel_index}_{row_index}_{col_index}_{comp_name}" component(key=key, state_manager=state_manager) From 6553258169c9dc1810d4b1fef804d8ff705bd97c Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 2 Jun 2026 13:28:10 +0000 Subject: [PATCH 10/10] Fix tag-walk selected-residue highlight to always mirror The LinePlot tag walk receives residues already REVERSED to align with the stored mass order, so the tag-relative selectedAA (an N->C index) always maps to the mirrored walk gap. Publish nTerminal=False on the tagMasses walk so the LinePlot mirrors it -- geometrically correct for both N- and C-anchored tags and matching the legacy behavior (whose nTerminal was effectively always false). 
Driving it off Nmass==-1 misplaced the gold highlight for C-anchored tags. The SequenceView tag-span still carries the real nTerminal (Nmass==-1) for its own orientation. --- content/FLASHTnT/FLASHTnTViewerOI.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/content/FLASHTnT/FLASHTnTViewerOI.py b/content/FLASHTnT/FLASHTnTViewerOI.py index dfbe5cbc..52f96360 100644 --- a/content/FLASHTnT/FLASHTnTViewerOI.py +++ b/content/FLASHTnT/FLASHTnTViewerOI.py @@ -348,7 +348,8 @@ def _clear_all() -> None: .list.eval(pl.element().cast(pl.Float64, strict=False)) .alias("tag_masses"), pl.col("TagSequence").alias("tag_sequence"), - # Anchoring + span (legacy TabulatorTagTable.vue:142-173). + # Tag span for the selected-residue (selectedAA) highlight (legacy + # TabulatorTagTable.vue:142-173). pl.col("StartPos").alias("start_pos"), pl.col("EndPos").alias("end_pos"), pl.col("Nmass").alias("n_mass"), @@ -377,16 +378,19 @@ def _clear_all() -> None: seq = selected["tag_sequence"][0] or "" residues = list(reversed(str(seq)))[: max(len(masses) - 1, 0)] - # Terminal anchoring (legacy `nTerminal = (Nmass == -1)`): an N-terminal tag is - # one whose N-mass is the `-1` sentinel. Forwarded into the tag-walk so the - # LinePlot honors the requested direction. - n_mass = selected["n_mass"][0] - n_terminal = (n_mass is not None) and (float(n_mass) == -1.0) - + # Selected-residue highlight direction. The residue list above is already + # REVERSED to align with the stored mass order, so a tag-relative `selectedAA` + # (an N->C index) always maps to the MIRRORED walk gap (gaps-1-selectedAA). + # Publish `nTerminal=False` so the LinePlot mirrors it: this is geometrically + # correct for BOTH N- and C-terminal-anchored tags AND matches the legacy + # behavior, whose `nTerminal` was effectively always false (it read a + # non-existent `row["N mass"]` key, so the legacy walk always mirrored). 
Driving + # the direction off `Nmass == -1` here instead would misplace the gold highlight + # for C-anchored tags relative to the (already reversed) residue letters. tag_masses = { "masses": list(masses), "residues": residues, - "nTerminal": n_terminal, + "nTerminal": False, } # Selected-residue gold (#F3A712) highlight (legacy @@ -411,6 +415,11 @@ def _clear_all() -> None: # Tag-span highlight on the SequenceView. StartPos/EndPos are protein-absolute # (matching the full-protein residue grid), so they bracket the tag directly. if start_pos is not None and end_pos is not None: + # Real terminal anchoring (legacy `selectedTag.nTerminal = (N mass == -1)`) + # for the SequenceView tag-span orientation -- distinct from the LinePlot + # walk above, which always mirrors because its residues are pre-reversed. + n_mass = selected["n_mass"][0] + n_terminal = (n_mass is not None) and (float(n_mass) == -1.0) state_manager.set_selection( TAG_SPAN_KEY, {"start": int(start_pos), "end": int(end_pos), "nTerminal": n_terminal},