


@Proceedings{MIDL2026,
  title =     {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  booktitle = {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  editor =    {Yuankai Huo and Mingchen Gao and Chang-Fu Kuo and Yueming Jin and Ruining Deng},
  publisher = {PMLR},
  series =    {Proceedings of Machine Learning Research},
  volume =    315
}



@InProceedings{pmlr-v315-chen26a,
  title = 	 {Beyond Classification: Elaborating Network Predictions for Better Weakly Supervised Quantization},
  author =       {Chen, Chih-Chieh and Kuo, Chang-Fu},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1--20},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/chen26a/chen26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/chen26a.html},
  abstract = 	 {For clinical applications, more detailed information such as specific locations and the region of interest (ROI) volumes is preferred. However, most of the time only classification annotations are available. Class Activation Mapping (CAM) and its variants are the most commonly used techniques for weakly supervised localization tasks. In this study, we assessed both traditional and modern network architectures regarding classification accuracy and CAM visualization. Although all networks achieved high AUROC scores and their heatmaps closely corresponded to pathology locations, we observed that the heatmaps were influenced by the particular network architectures and pretrained weights used. Additionally, current models produce heatmaps from small latent spaces (e.g. $16 \times 16$), which limits the precision of these heatmaps for further detailed analysis. Based on the observations mentioned above, we designed a UNet-style architecture that utilizes pretrained classification networks as the encoder and produces heatmaps within a latent space of size $128 \times 128$. We observed that the generated heatmaps are more detailed and suitable for weakly supervised segmentation. We validated the effectiveness of our approach using the intracerebral hemorrhage (ICH) dataset.}
}



@InProceedings{pmlr-v315-shen26a,
  title = 	 {Style Content Decomposition-based Data Augmentation for Domain Generalizable Medical Image Segmentation},
  author =       {Shen, Zhiqiang and Cao, Peng and Yang, Jinzhu and Zaiane, Osmar R. and Chen, Zhaolin},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {21--35},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/shen26a/shen26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/shen26a.html},
  abstract = 	 {Due to domain shifts across diverse medical imaging modalities, learned segmentation models often suffer significant performance degradation during deployment. We posit that these domain shifts can be categorized into two main components: (1) "style" shifts, referring to global disparities in image properties such as illumination, contrast, and color; and (2) "content" shifts, involving local discrepancies in anatomical structures. To address the domain shifts in medical image segmentation, we first factorize an image into style codes and content maps, explicitly modeling the "style" and "content" components. Building on this, we introduce a Style-Content decomposition-based data augmentation algorithm (StyCona), which performs augmentation on both the global style and local content of source-domain images, enabling the training of a well-generalized model for domain generalizable medical image segmentation. StyCona is a simple yet effective plug-and-play module that substantially improves model generalization without requiring additional training parameters or modifications to segmentation model architectures. Experiments on cardiac magnetic resonance imaging and fundus photography segmentation tasks, with single and multiple target domains respectively, demonstrate the effectiveness of StyCona and its superiority over state-of-the-art domain generalization methods.}
}



@InProceedings{pmlr-v315-kwak26a,
  title = 	 {Beyond Machine Interpretation: Learning from Expert Over-Reads Improves ECG Diagnosis},
  author =       {Kwak, Sunwoo and Liu, Fengbei and Nizam, Nusrat B. and Richter, Ilan and Uriel, Nir and Okin, Peter M. and Sabuncu, Mert R.},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {36--55},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/kwak26a/kwak26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/kwak26a.html},
  abstract = 	 {Automated machine-read ECG interpretations are widely used in clinical practice but often unreliable, leading to systematic diagnostic errors. This work investigates how training with cardiologist over-reads impacts model accuracy and clinical reliability. Using a large paired corpus of over two million ECGs containing both machine and expert interpretations, we evaluate three learning paradigms: (i) supervised learning on expert over-read labels, (ii) self-training that extends expert supervision to public ECGs, and (iii) multimodal contrastive learning with CLIP and NegCLIP. Across all settings, models trained with expert over-read data consistently outperform those trained on machine-read labels, especially for rare but clinically important conditions. Self-training and NegCLIP further demonstrate scalable strategies to propagate expert knowledge beyond labeled datasets. These findings highlight the essential role of expert over-reads in developing trustworthy and clinically aligned ECG AI systems.}
}



@InProceedings{pmlr-v315-friedrich26a,
  title = 	 {MedFuncta: A Unified Framework for Learning Efficient Medical Neural Fields},
  author =       {Friedrich, Paul and Bieder, Florentin and McGinnis, Julian and Wolleb, Julia and Rueckert, Daniel and Cattin, Philippe C.},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {56--87},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/friedrich26a/friedrich26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/friedrich26a.html},
  abstract = 	 {Research in medical imaging primarily focuses on discrete data representations that poorly scale with grid resolution and fail to capture the often continuous nature of the underlying signal. Neural Fields (NFs) offer a powerful alternative by modeling data as continuous functions. While single-instance NFs have successfully been applied in medical contexts, extending them to large-scale medical datasets remains an open challenge. We therefore introduce MedFuncta, a unified framework for large-scale NF training on diverse medical signals. Building on Functa, our approach encodes data into a unified representation, namely a 1D latent vector, that modulates a shared, meta-learned NF, enabling generalization across a dataset. We revisit common design choices, introducing a non-constant frequency parameter $\omega$ in widely used SIREN activations, and establish a connection between this $\omega$-schedule and layer-wise learning rates, relating our findings to recent work in theoretical learning dynamics. We additionally introduce a scalable meta-learning strategy for shared network learning that employs sparse supervision during training, thereby reducing memory consumption and computational overhead while maintaining competitive performance. Finally, we evaluate MedFuncta across a diverse range of medical datasets and show how to solve relevant downstream tasks on our neural data representation. To promote further research in this direction, we release our code, model weights and the first large-scale dataset - MedNF - containing $>500k$ latent vectors for multi-instance medical NFs. }
}



@InProceedings{pmlr-v315-ochs26a,
  title = 	 {Multimodal Assessment of Pancreatic Cancer Resectability Using Deep Learning},
  author =       {Ochs, Vincent and Kuemmerli, Christoph and Bieder, Florentin and Wolleb, Julia and Lavanchy, Jo\"el L. and Ruppel, Julia and Liechti, Jan and Taha-Mehlitz, Stephanie and Nebiker, Christian Andreas and M\"uller, Beat and Fusai, Giuseppe Kito and Pollok, Joerg-Matthias and Taha, Anas and Cattin, Philippe C. and Staubli, Sebastian},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {88--106},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/ochs26a/ochs26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/ochs26a.html},
  abstract = 	 {Accurate determination of pancreatic ductal adenocarcinoma (PDAC) resectability relies on evaluating how the tumor interacts with major peripancreatic vessels on CT imaging, yet expert assessment often shows substantial variability. We introduce a fully automated multimodal deep learning framework that jointly analyzes 3D contrast enhanced CT and structured clinical information to classify patients into the three National Comprehensive Cancer Network (NCCN) resectability categories (upfront resectable, borderline resectable, locally advanced). The approach uses a Swin-UNETR backbone to obtain anatomy aware image representations through auxiliary segmentation of pancreas, tumor, and vascular structures. These features are fused with a compact clinical embedding derived from 17 routinely collected variables and processed by a lightweight classification head. Model training is guided by a dynamic multitask objective that adapts the balance between segmentation and classification based on current tumor Dice performance, promoting feature representations that remain both anatomically informed and discriminative. In a cohort of 159 patients (85 upfront resectable, 47 borderline resectable, 27 locally advanced), the proposed method achieved an AUC of 0.86, a macro-F1 of 0.79, and an accuracy of 0.85 using stratified nested 5-fold cross validation, outperforming adapted transformer based and geometric baseline approaches. External validation on an independent cohort with 52 patients from Kantonsspital Aarau (KSA Aarau) yielded an AUC of 0.86, a macro-F1 of 0.81, and an accuracy of 0.87, supporting cross-institution generalization. Notably, the external KSA Aarau cohort contained complete clinical information for all variables used by the model and therefore did not require imputation. The comparable performance observed on this dataset suggests that the KNN based imputation applied to the training cohort did not introduce a detectable performance bias for the clinical variables considered. Because segmentation labels are required only during training, the final system enables mask free inference while preserving vessel aware interpretability. These findings demonstrate that integrating anatomical supervision with clinical context yields a robust and reproducible tool for supporting operability (i.e., NCCN-based resectability) assessment in pancreatic cancer. The implementation is publicly available at , and the data, as well as the weights, can be made available by the corresponding author upon reasonable request.}
}



@InProceedings{pmlr-v315-li26a,
  title = 	 {Cross-Domain Semi-Supervised Organ Detection},
  author =       {Li, Nian and Ghahremani, Morteza and Jian, Bailiang and Cervera, Pascual Tejero and Wiestler, Benedikt and Makowski, Marcus and Wachinger, Christian},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {107--120},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/li26a/li26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/li26a.html},
  abstract = 	 {Domain adaptation for 3D organ detection in CT imaging is challenging due to variations in scanner types, imaging protocols, and overall acquisition conditions. As supervised detection models require large, annotated datasets from diverse scanners and institutions, semi-supervised approaches have gained attention for their ability to leverage limited unlabeled target data. However, traditional semi-supervised methods typically fail to make effective use of the few labeled target samples and most often do not yield satisfactory results. To address this limitation, we introduce a novel cross-domain semi-supervised detection framework (CDSS-Det) built upon the Transformer-based Organ-DETR model. CDSS-Det is a cross-domain semi-supervised framework for 3D organ detection that addresses unreliable pseudo-labels and limited target supervision under domain shift. It introduces a curriculum-guided pseudo-labeling mechanism and domain-robust representation learning to enable effective knowledge transfer from a well-annotated source domain to a sparsely labeled target domain. Experiments on multi-domain CT datasets demonstrate that incorporating a small number of labeled target samples significantly boosts detection performance over conventional domain adaptation and semi-supervised methods. CDSS-Det consistently achieves higher mean Average Precision (mAP), with notable improvements in detecting small organs, and surpasses a fully supervised model trained solely on the labeled target domain by over 10%. These results underscore the potential of CDSS-Det in efficiently leveraging both labeled and unlabeled target data in cross-domain organ detection, advancing annotation-efficient deep learning models in medical imaging.}
}



@InProceedings{pmlr-v315-wu26a,
  title = 	 {Simplex-Aligned Diffusion with Cross-Granularity Interaction for Robust Medical Image Classification},
  author =       {Wu, Chao and Gao, Mingchen},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {121--152},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/wu26a/wu26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/wu26a.html},
  abstract = 	 {The clinical deployment of medical image classification systems hinges on their trustworthiness, specifically, the ability to provide calibrated uncertainty estimates and maintain robustness under acquisition shifts. While generative diffusion models offer promising distributional modeling, existing approaches suffer from a fundamental geometric conflict: they apply unbounded Gaussian noise directly to bounded label simplices. We identify that this theoretical mismatch forces predictions into invalid probability spaces, serving as a primary source of model unreliability and overconfidence. To resolve this, we propose Simplex-Aligned Diffusion. Unlike standard methods, we reformulate the label generation process on an unconstrained logit manifold. By mapping the probability simplex to a Euclidean space, we ensure mathematical consistency with Gaussian diffusion, which effectively acts as a geometric regularizer for uncertainty calibration. Furthermore, we introduce a Transformer-based Cross-Granularity Interaction module to stabilize visual guidance by dynamically modeling global-local dependencies. Extensive experiments on the APTOS2019 and HAM10000 benchmarks demonstrate that our framework not only achieves competitive accuracy but also significantly outperforms state-of-the-art baselines in calibration error (ECE) and resilience to clinical artifacts (e.g., sensor noise, blur), offering a mathematically rigorous and clinically reliable paradigm. Code is available at }
}



@InProceedings{pmlr-v315-herten26a,
  title = 	 {GeoReg: Direct biplanar DSA-to-CTA registration with geodesic consistency for acute ischemic stroke},
  author =       {van Herten, Rudolf L. M. and Graf, Robert and Bitzer, Felix and Kirschke, Jan S. and Paetzold, Johannes C.},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {153--171},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/herten26a/herten26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/herten26a.html},
  abstract = 	 {The complementary nature of pre-procedural computed tomography angiography (CTA) and intraoperative digital subtraction angiography (DSA) has motivated significant interest in their registration to enhance therapeutic decision-making during stroke interventions. However, current methods depend on accurate vessel segmentation in both modalities, creating a deployment bottleneck due to the requirement for extensive annotated training data. Here, we present an alternative approach that establishes the feasibility of registration without this dependency. Instead of extracting vascular features using pre-trained models, we optimize a direct registration framework using maximum intensity projections of DSA sequences to align a silhouette of the subtracted X-ray image. We introduce a geodesic consistency formulation that jointly optimizes biplanar views, employing soft geometric constraints on SO(3) to maintain consistency while accommodating non-orthogonal scanner configurations. We demonstrate the effectiveness of this model on clinical stroke data and find that it outperforms existing methods, proving particularly effective in escaping local minima where single-view optimization fails. These results indicate that reliable DSA-to-CTA registration is achievable without vessel-specific training data, simplifying the path toward clinical integration.}
}



@InProceedings{pmlr-v315-li26b,
  title = 	 {CARE: Confidence-aware Ratio Estimation for Medical Biomarkers},
  author =       {Li, Jiameng and Popordanoska, Teodora and Tiulpin, Aleksei and Gruber, Sebastian G. and Maes, Frederik and Blaschko, Matthew B.},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {172--197},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/li26b/li26b.pdf},
  url = 	 {https://proceedings.mlr.press/v315/li26b.html},
  abstract = 	 {Ratio-based biomarkers (RBBs), such as the proportion of necrotic tissue within a tumor, are widely used in clinical practice to support diagnosis, prognosis, and treatment planning. These biomarkers are typically estimated from segmentation outputs by computing region-wise ratios. Despite the high-stakes nature of clinical decision making, existing methods provide only point estimates, offering no measure of uncertainty. In this work, we propose a unified confidence-aware framework for estimating ratio-based biomarkers. Our uncertainty analysis stems from two observations: (1) the probability ratio estimator inherently admits a statistical confidence interval regarding local randomness (bias and variance); (2) the segmentation network is not perfectly calibrated (calibration error). We perform a systematic analysis of error propagation in the segmentation-to-biomarker pipeline and identify model miscalibration as the dominant source of uncertainty. Extensive experiments show that our method produces statistically sound confidence intervals, with tunable confidence levels, enabling more trustworthy application of segmentation-derived RBBs in clinical workflows. Codes: }
}



@InProceedings{pmlr-v315-gallee26a,
  title = 	 {FunnyNodules: A Customizable Medical Dataset Tailored for Evaluating Explainable AI},
  author =       {Gall{\'e}e, Luisa and Xiong, Yiheng and Beer, Meinrad and G{\"o}tz, Michael},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {198--214},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/gallee26a/gallee26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/gallee26a.html},
  abstract = 	 {Densely annotated medical image datasets that capture not only diagnostic labels but also the underlying reasoning behind these diagnoses are scarce. Such reasoning-related annotations are essential for developing and evaluating explainable AI (xAI) models that reason similarly to radiologists: making correct predictions for the right reasons. To address this gap, we introduce FunnyNodules, a fully parameterized synthetic dataset designed for systematic analysis of attribute-based reasoning in medical AI models. The dataset generates abstract lung nodule–like shapes with controllable visual attributes such as roundness, margin sharpness, and spiculation. The target class is derived from a predefined attribute combination, allowing full control over the decision rule that links attributes to the diagnostic class. We demonstrate how FunnyNodules can be used in model-agnostic evaluations to assess whether models learn correct attribute–target relations, to interpret over- or underperformance in attribute prediction, and to analyze attention alignment with attribute-specific regions of interest. The framework is fully customizable, supporting variations in dataset complexity, target definitions, class balance, and beyond. With complete ground truth information, FunnyNodules provides a versatile foundation for developing, benchmarking, and conducting in-depth analyses of explainable AI methods in medical image analysis.}
}



@InProceedings{pmlr-v315-saeed26a,
  title = 	 {Machine-Learning Based Detection of Coronary Artery Calcification Using Synthetic Chest X-Rays},
  author =       {Saeed, Dylan and Gharleghi, Ramtin and Beier, Susann and Singh, Sonit},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {215--231},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/saeed26a/saeed26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/saeed26a.html},
  abstract = 	 {Coronary artery calcification (CAC) is a strong predictor of cardiovascular events, with computed tomography (CT)-based Agatston scoring widely regarded as the clinical gold standard. However, CT is costly and impractical for large-scale screening, while chest X-rays (CXRs) are inexpensive but lack reliable ground truth labels, constraining deep learning development. Digitally reconstructed radiographs (DRRs) offer a scalable alternative by projecting CT volumes into CXR-like images while inheriting precise labels. In this work, we provide the first systematic evaluation of DRRs as a surrogate training domain for CAC detection. Using 667 CT scans from the COCA dataset, we generate synthetic DRRs (posterior–anterior and lateral views per scan) and assess model capacity, super-resolution (SR) fidelity enhancement, preprocessing, and training strategies. Lightweight convolutional neural networks (CNNs) trained from scratch outperform large pretrained networks (DenseNet121, ResNet18); pairing super-resolution with contrast enhancement yields significant gains; and curriculum learning stabilises training under weak supervision. Our best configuration achieves a mean area under the receiver operating characteristic curve (AUC) of 0.754, comparable to or exceeding prior CXR-based studies. These results establish DRRs as a scalable, label-rich foundation for CAC detection, while laying the foundation for future transfer learning and domain adaptation to real CXRs.}
}



@InProceedings{pmlr-v315-mia26a,
  title = 	 {D$^2$-Former: Mixture-Of-Experts Guided Dual Transformer for Multi-Scale Medical Image Segmentation},
  author =       {Mia, Md Sohag and Taourirte, Aya and Adnan, Muhammad Abdullah and Ming, Wenlong},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {232--255},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/mia26a/mia26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/mia26a.html},
  abstract = 	 {Precise delineation of anatomical structures from medical images is critical for clinical diagnosis and treatment planning, yet remains profoundly challenging due to ambiguous boundaries, extreme scale variations, and the heterogeneous appearances of pathological tissues. Current segmentation methods frequently fall short in effectively balancing global contextual understanding with adaptive, multi-scale feature fusion, limiting their robustness across diverse clinical scenarios. To address these limitations, we propose D2-Former, a novel encoder-decoder framework that integrates a dual-encoder architecture–combining a Swin Transformer for hierarchical local-global modeling and a DINOv3 foundation model for high-fidelity dense feature extraction—with a Softer Mixture-of-Experts (Softer-MoE) module for input-adaptive feature refinement. Our design further introduces a Spatial-Frequency Gated Channel Attention (SF-GCA) module to fuse complementary encoder representations and a Residual Attention Decoder (RAD) with deep supervision for progressive map reconstruction. Extensive experiments across nine public benchmarks–spanning polyp segmentation, retinal vessel delineation, multi-organ abdominal CT segmentation, and nuclei instance segmentation–demonstrate that D2-Former achieves state-of-the-art or highly competitive performance. The model exhibits strong generalization across varied anatomical scales, imaging modalities, and clinical scenarios, underscoring its potential for reliable computer-assisted diagnosis.}
}



@InProceedings{pmlr-v315-komura26a,
  title = 	 {PAGET: Hierarchical Multi-Teacher Knowledge Distillation for Comprehensive Tumor Microenvironment Segmentation},
  author =       {Komura, Daisuke and Takao, Maki and Ochi, Mieko and Onoyama, Takumi and Katoh, Hiroto and Abe, Hiroyuki and Sano, Hiroyuki and Konishi, Teppei and Kumasaka, Toshio and Yokose, Tomoyuki and Miyagi, Yohei and Ushiku, Tetsuo and Ishikawa, Shumpei},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {256--279},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/komura26a/komura26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/komura26a.html},
  abstract = 	 {Comprehensive characterization of the tumor microenvironment (TME) from H&E-stained histopathology images remains challenging due to the diversity of cellular components and limitations of current segmentation methods. We present PAGET (Pathological image segmentation via AGgrEgated Teachers), a multi-teacher knowledge distillation framework that enables simultaneous segmentation of 13 TME components from a single efficient model. Our key insight is that teacher predictions should be aggregated following the biological taxonomy of cell types—from tissue-level context through major cell categories to specific subtypes—rather than simple voting. By training specialized teachers on immunohistochemical restaining data and distilling their aggregated knowledge, the resulting student model not only matches but consistently outperforms the teacher ensemble on external datasets. We provide two complementary variants: PAGET-S for rapid semantic segmentation and PAGET-H for detailed panoptic segmentation. Extensive evaluation across three external datasets demonstrates robust generalization.}
}



@InProceedings{pmlr-v315-rystrom26a,
  title = 	 {OxEnsemble: Fair Ensembles for Low-Data Classification},
  author =       {Rystr{\o}m, Jonathan and Fu, Zihao and Russell, Chris},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {280--307},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/rystrom26a/rystrom26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/rystrom26a.html},
  abstract = 	 {We address the problem of fair classification in settings where data is scarce and unbalanced across demographic groups. Such low-data regimes are common in domains like medical imaging, where false negatives can have fatal consequences. We propose OxEnsemble, a novel approach for efficiently training ensembles and enforcing fairness in these low-data regimes. Unlike other approaches, we aggregate predictions across ensemble members, each trained to satisfy fairness constraints. By construction, OxEnsemble is both data-efficient – carefully reusing held-out data to enforce fairness reliably – and compute-efficient, requiring little more compute than used to fine-tune or evaluate an existing model. We validate this approach with new theoretical guarantees. Experimentally, our approach yields more consistent outcomes and stronger fairness-accuracy trade-offs than existing methods across multiple challenging medical imaging classification datasets.}
}



@InProceedings{pmlr-v315-avci26a,
  title = 	 {DIST-CLIP: Arbitrary Metadata and Image Guided MRI Harmonization via Disentangled Anatomy-Contrast Representations},
  author =       {Avci, Mehmet Yigit and Borges, Pedro and Fernandez, Virginia and Wright, Paul and Yigitsoy, Mehmet and Ourselin, Sebastien and Cardoso, Jorge},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {308--321},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/avci26a/avci26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/avci26a.html},
  abstract = 	 {Deep learning holds immense promise for transforming medical image analysis, yet its clinical generalization remains profoundly limited. A major barrier is data heterogeneity. This is particularly true in Magnetic Resonance Imaging, where scanner hardware differences, diverse acquisition protocols, and varying sequence parameters introduce substantial domain shifts that obscure underlying biological signals. Data harmonization methods aim to reduce this instrumental and acquisition variability, but existing approaches remain insufficient. When applied to imaging data, image-based harmonization approaches are often restricted by the need for target images (i.e., mapping source to target modality given a reference image), while existing text-guided methods rely on simplistic labels that fail to capture complex acquisition details or are typically restricted to datasets with limited variability (i.e., mapping source to target modality given some conditioning text), failing to capture the heterogeneity of real-world clinical environments. To address these limitations, we propose DIST-CLIP (Disentangled Style Transfer with CLIP Guidance), a unified framework for MRI harmonization that flexibly uses either target images or DICOM metadata for guidance. Our framework explicitly disentangles anatomical content from image contrast, with the contrast representations being extracted using pre-trained CLIP encoders. These contrast embeddings are then integrated into the anatomical content via a novel Adaptive Style Transfer module. We trained and evaluated DIST-CLIP on diverse real-world clinical datasets, and showed significant improvements in performance when compared against state-of-the-art methods in both style translation fidelity and anatomical preservation, offering a flexible solution for style transfer and standardizing MRI data.}
}



@InProceedings{pmlr-v315-topolnjak26a,
  title = 	 {Orientation Normalization of Multi-Stain Skin Tissue Cross-Sections},
  author =       {Topolnjak, Ema and Paulides, Evi and Blokx, Willeke A. M. and Veta, Mitko and Lucassen, Ruben T.},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {322--341},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/topolnjak26a/topolnjak26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/topolnjak26a.html},
  abstract = 	 {Efficient examination of skin tissue specimens is key for pathologists to keep up with an increasing workload. Normalizing the orientation of tissue cross-sections before manual assessment could contribute to a more streamlined digital workflow. In this study, we compare multiple deep learning-based approaches for predicting the rotation angle required to correct the misorientation of skin tissue cross-sections. The models were developed and evaluated using a dataset of 10,649 H&E-stained and 9,731 IHC-stained cross-section images from specimens with melanocytic lesions. Our results show that framing rotation angle prediction as a classification task with the circular target space divided into separate classes performed best, reaching mean absolute errors of 2.77$^\circ$ and 3.56$^\circ$ on the test sets of H&E and IHC-stained cross-sections, respectively, approaching the level of human annotators. Automated orientation normalization, when implemented in whole slide image viewers, could make tissue examination more efficient and convenient for pathologists, while also serving as a valuable preprocessing step for the development of position-aware or multi-stain deep learning models.}
}



@InProceedings{pmlr-v315-wang26a,
  title = 	 {Concept Complement Bottleneck Model for Interpretable Medical Image Diagnosis},
  author =       {Wang, Hongmei and Hou, Junlin and He, Sunan and Yang, Shu and Chen, Hao},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {342--359},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/wang26a/wang26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/wang26a.html},
  abstract = 	 {Models based on human-understandable concepts have received extensive attention to improve model interpretability for trustworthy artificial intelligence in the field of medical image analysis. These methods can provide convincing explanations for model decisions but heavily rely on detailed annotations of predefined concepts. Consequently, they are ineffective when concepts or annotations are incomplete or of low quality. Although some methods can automatically discover novel and effective visual concepts instead of relying on predefined ones, or generate human-understandable concepts using large language models, they often deviate from medical diagnostic evidence and remain difficult to interpret. In this paper, we propose a concept complement bottleneck model for interpretable medical image diagnosis. Specifically, we use cross-attention modules to extract key image features related to the predefined textual concepts and employ independent concept adapters and bottleneck layers to distinguish concepts more effectively. Additionally, we devise a concept complement module to mine local concepts from the concept bank constructed using medical literature. The model jointly learns expert-annotated predefined concepts and automatically discovered ones to improve performance in concept detection and disease diagnosis. Comprehensive experiments demonstrate that our model outperforms state-of-the-art methods while providing diverse and interpretable explanations. }
}



@InProceedings{pmlr-v315-dunn-beltran26a,
  title = 	 {Seeing Down the Line: Endoscopic Reconstruction with Centerline Constraints},
  author =       {Dunn Beltran, Andrea and Hardy, Romain and Rajpurkar, Pranav},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {360--378},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/dunn-beltran26a/dunn-beltran26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/dunn-beltran26a.html},
  abstract = 	 {Colonoscopy remains the gold standard for colorectal cancer screening, but there is still no real-time, geometry-aware way to quantify which parts of the colon have been inspected during a procedure. We revisit 3D Gaussian endoscopic reconstruction as a representation and geometry problem rather than a new network design. Assuming known camera poses and off-the-shelf depth or photometric supervision, we add a simple centerline-based coordinate system and priors on top of an existing Gaussian mapping backbone. From the noisy pose stream we maintain an online centerline and Bishop frame, assign each Gaussian tubular coordinates $(s,r,\theta)$, and use these coordinates both to regularize the map toward a hollow tube and to accumulate coverage statistics in colon-intrinsic space. On long C3VD phantom colonoscopy sequences, this lightweight modification achieves Chamfer distance comparable to or better than an endoscopy-specific 3D Gaussian SLAM baseline while running at frame rates close to MonoGS and yielding improved rendering quality, with negligible additional computation. At the same time, the same representation produces unrolled colon views and segment-wise coverage summaries essentially "for free", making centerline-aware Gaussian mapping a practical drop-in component for future real-time quality monitoring tools in colonoscopy.}
}



@InProceedings{pmlr-v315-wang26b,
  title = 	 {RVO-MIS: Robust Visual Odometry for Minimally Invasive Surgery},
  author =       {Wang, Zhuo and Chien, Chiang-Heng and Lee, Eungjoo},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {379--399},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/wang26b/wang26b.pdf},
  url = 	 {https://proceedings.mlr.press/v315/wang26b.html},
  abstract = 	 {Visual odometry (VO) in minimally invasive surgery (MIS) scenarios plays a crucial role in current and future endoscopic surgical intervention assistance systems. However, MIS environments pose severely challenging situations for typical VO algorithms due to textureless scenes, the presence of surgical instruments, light reflections, flowing blood and organ deformation, etc. Classic VO methods adopt a smooth motion prior to generate an initial guess for camera pose and then refine it through minimizing reprojection errors. Recent deep learning methods incorporate learned depths and estimate camera poses through minimizing photometric residuals. These approaches, however, lack robustness in estimation due to abrupt motion change and unpredictable illumination changes commonly seen in MIS environments. In this paper, we present RVO-MIS, a robust VO framework in MIS by first integrating SIFT and LightGlue for reliable feature correspondences, and then solving a sequence of absolute camera poses under an M-estimator sample consensus (MSAC) scheme. By advocating the absolute-pose-first formulation to prioritize geometric consistency and robustness, our approach decouples the camera motion tracking from smooth motion prior, photometric consistency, learned depths, etc. Through evaluations on the SCARED and EndoSLAM datasets, RVO-MIS demonstrates consistently accurate camera pose estimations. In challenging MIS situations where many methods fail or become inaccurate, RVO-MIS excels in both camera trajectory completion rate and accuracy.}
}



@InProceedings{pmlr-v315-wang26c,
  title = 	 {LabelG: Consistent Pairwise 3D CT Image and Segmentation Mask Generation via Medical Foundation Models},
  author =       {Wang, Lu-Yan and Wang, Tzung-Dau and Lai, Shang-Hong},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {400--413},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/wang26c/wang26c.pdf},
  url = 	 {https://proceedings.mlr.press/v315/wang26c.html},
  abstract = 	 {Medical image generation is increasingly used for data augmentation in tasks such as segmentation. However, most existing approaches focus solely on synthesizing high-quality images, while the corresponding segmentation masks are generated separately or may lack structural alignment with the images. To address this limitation, we introduce LabelG, a lightweight module that works with pretrained 3D CT diffusion foundation models to produce paired CT images and segmentation masks in a single sampling pass. LabelG decodes multi-scale latent features using a split-MLP architecture and aggregates predictions via a voting mechanism to yield anatomically coherent image–mask pairs, without requiring ground-truth masks or textual prompts at inference time. Experiments on four CT datasets demonstrate that the generated pairs achieve high visual fidelity and can improve downstream segmentation performance when used to augment limited real data. LabelG offers an efficient and scalable approach for synthesizing paired medical data, helping enhance data efficiency in medical image segmentation.}
}



@InProceedings{pmlr-v315-banerjee26a,
  title = 	 {DiffTAC: Temporal-Conditioned Latent Diffusion with Integrated Attention for Intermediate Frame Generation and Temporal Super-Resolution in Cardiac MRI},
  author =       {Banerjee, Shilajit and Sinha, Aniruddha},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {414--432},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/banerjee26a/banerjee26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/banerjee26a.html},
  abstract = 	 {Cardiac cine MRI captures dynamic cardiac motion, yet its temporal resolution remains fundamentally constrained by long acquisition times and breath-hold requirements. We introduce DiffTAC, a latent diffusion framework that synthesizes intermediate cardiac phases by treating time as an explicit conditioning variable. Using the end-diastolic (ED) and end-systolic (ES) frames as anatomical anchors, DiffTAC performs denoising in the latent space of a pretrained variational autoencoder and conditions generation on a learnable temporal embedding that specifies the desired phase location within the cardiac cycle. To effectively fuse temporal conditioning with anatomical context, we propose the Integrated Attention Block (IAB), a unified module that combines self-attention and cross-attention to modulate spatial features according to the target temporal position. This design enables the model to synthesize anatomically coherent, temporally smooth intermediate frames. Experiments on multiple publicly available datasets demonstrate that DiffTAC produces highly realistic intermediate phases and achieves superior temporal consistency compared to classical interpolation, optical-flow–based reconstruction, and ablated variants of our architecture. These findings show that modeling time as a conditioning signal within a diffusion framework provides an effective and acquisition-free solution for temporal super-resolution in cardiac MRI.}
}



@InProceedings{pmlr-v315-djoumessi26a,
  title = 	 {SoftCAM: Making black box models self-explainable for medical image analysis},
  author =       {Djoumessi, Kerol and Berens, Philipp},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {433--467},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/djoumessi26a/djoumessi26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/djoumessi26a.html},
  abstract = 	 {Convolutional neural networks (CNNs) are widely used for high-stakes applications like medicine, often surpassing human performance. However, most explanation methods rely on post-hoc attribution, approximating the decision-making process of already trained black-box models. These methods are often sensitive, unreliable, and fail to reflect true model reasoning, limiting their trustworthiness in critical applications. In this work, we introduce SoftCAM, a straightforward yet effective approach that makes standard CNN architectures inherently interpretable. By removing the global average pooling layer and replacing the fully connected classification layer with a convolution-based class evidence layer, SoftCAM preserves spatial information and produces explicit class activation maps that form the basis of the model’s predictions. Evaluated on three medical datasets spanning three imaging modalities, SoftCAM maintains classification performance while significantly improving both the qualitative and quantitative explanation compared to existing post-hoc methods.}
}



@InProceedings{pmlr-v315-liu26a,
  title = 	 {MetaVoxel: Joint Diffusion Modeling of Imaging and Clinical Metadata},
  author =       {Liu, Yihao and Gao, Chenyu and Zuo, Lianrui and Kim, Michael E. and Boyd, Brian D. and Barnes, Lisa L. and Kukull, Walter A. and Beason-Held, Lori L. and Resnick, Susan M. and Hohman, Timothy J. and Taylor, Warren D. and Landman, Bennett A.},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {468--487},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/liu26a/liu26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/liu26a.html},
  abstract = 	 {Modern deep learning methods have achieved impressive results across tasks from disease classification, estimating continuous biomarkers, to generating realistic medical images. Most of these approaches are trained to model conditional distributions defined by a specific predictive direction with a specific set of input variables. We introduce MetaVoxel, a generative joint diffusion modeling framework that models the joint distribution over imaging data and clinical metadata by learning a single diffusion process spanning all variables. By capturing the joint distribution, MetaVoxel unifies tasks that traditionally require separate conditional models and supports flexible zero-shot inference using arbitrary subsets of inputs without task-specific retraining. Using more than $10,000$ T1-weighted MRI scans paired with clinical metadata from nine datasets, we show that a single MetaVoxel model can perform image generation, age estimation, and sex prediction, achieving performance comparable to established task-specific baselines. Additional experiments highlight its capabilities for flexible inference. Together, these findings demonstrate that joint multimodal diffusion offers a promising direction for unifying medical AI models and enabling broader clinical applicability.}
}



@InProceedings{pmlr-v315-zhu26a,
  title = 	 {SCR$^2$-ST: Combine Single Cell with Spatial Transcriptomics for Efficient Active Sampling via Reinforcement Learning},
  author =       {Zhu, Junchao and Deng, Ruining and Guo, Junlin and Yao, Tianyuan and Qu, Chongyu and Xiong, Juming and Lu, Siqi and Lu, Zhengyi and Zhu, Yanfan and Lionts, Marilyn and Yang, Yuechen and Zheng, Yalin and Wang, Yu and Zhao, Shilin and Yang, Haichun and Huo, Yuankai},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {488--509},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/zhu26a/zhu26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/zhu26a.html},
  abstract = 	 {Spatial transcriptomics (ST) is an emerging technology that enables researchers to investigate the molecular relationships underlying tissue morphology. However, acquiring ST data remains prohibitively expensive, and traditional fixed-grid sampling strategies lead to redundant measurements of morphologically similar or biologically uninformative regions, thus resulting in scarce data that constrain current methods. The well-established single-cell sequencing field, however, could provide rich biological data as an effective auxiliary source to mitigate this limitation. To bridge these gaps, we introduce SCR$^2$-ST, a unified framework that leverages single-cell prior knowledge to guide efficient data acquisition and accurate expression prediction. SCR$^2$-ST integrates a single-cell guided reinforcement learning-based (SCRL) active sampling and a hybrid regression-retrieval prediction network SCR$^2$Net. SCRL combines single-cell foundation model embeddings with spatial density information to construct biologically grounded reward signals, enabling selective acquisition of informative tissue regions under constrained sequencing budgets. SCR$^2$Net then leverages the actively sampled data through a hybrid architecture combining regression-based modeling with retrieval-augmented inference, where a majority cell-type filtering mechanism suppresses noisy matches and retrieved expression profiles serve as soft labels for auxiliary supervision. We evaluated SCR$^2$-ST on three public ST datasets, demonstrating SOTA performance in both sampling efficiency and prediction accuracy, particularly under low-budget scenarios.}
}



@InProceedings{pmlr-v315-lolos26a,
  title = 	 {Context-Aware Patch Representations for Multiple Instance Learning},
  author =       {Lolos, Andreas and Christodoulou, Theofilos and Moustakas, Aris L. and Christodoulidis, Stergios and Vakalopoulou, Maria},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {510--543},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/lolos26a/lolos26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/lolos26a.html},
  abstract = 	 {In computational pathology, weak supervision has become the standard for deep learning due to the gigapixel scale of WSIs and the scarcity of pixel-level annotations, with Multiple Instance Learning (MIL) established as the principal framework for slide-level model training. In this paper, we introduce CAPRMIL, a novel setting for MIL methods, inspired by advances in Neural Partial Differential Equation (PDE) solvers. Instead of relying on complex attention-based aggregation, we propose an efficient, aggregator-agnostic framework that removes the complexity of correlation learning from the MIL aggregator. CAPRMIL produces rich context-aware patch embeddings that promote effective correlation learning on downstream tasks. By projecting patch features (extracted using a frozen patch encoder) into a small set of global context/morphology-aware tokens and utilizing multi-head self-attention, CAPRMIL injects global context with linear computational complexity with respect to the bag size. Paired with a simple Mean MIL aggregator, CAPRMIL matches state-of-the-art (SOTA) slide-level performance across multiple public pathology benchmarks, while reducing the total number of trainable parameters by $48\%$–$92.8\%$ versus SOTA MILs, lowering FLOPs during inference by $52\%$–$99\%$, and ranking among the best models on GPU memory efficiency and training time. Our results indicate that learning rich, context-aware instance representations before aggregation is an effective and scalable alternative to complex pooling for whole-slide analysis.}
}



@InProceedings{pmlr-v315-li26c,
  title = 	 {StainNet: Scaling Self-Supervised Foundation Models on Immunohistochemistry and Special Stains for Computational Pathology},
  author =       {Li, Jiawen and Hu, Jiali and Ling, Xitong and Lv, Yongqiang and Chen, Yuxuan and Wang, Yizhi and Guan, Tian and Liu, Yifei and He, Yonghong},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {544--569},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/li26c/li26c.pdf},
  url = 	 {https://proceedings.mlr.press/v315/li26c.html},
  abstract = 	 {Foundation models trained with self-supervised learning (SSL) on large-scale histological images have significantly accelerated the development of computational pathology. These models can serve as backbones for region-of-interest (ROI) image analysis or patch-level feature extractors in whole-slide images (WSIs) based on multiple instance learning (MIL). Existing pathology foundation models (PFMs) are typically pre-trained on Hematoxylin-Eosin (H&E) stained pathology images. However, images such as immunohistochemistry (IHC) and special stains are also frequently used in clinical practice. PFMs pre-trained mainly on H&E-stained images may be limited in clinical applications involving these non-H&E images. To address this issue, we propose StainNet, a collection of self-supervised foundation models specifically trained for IHC and special stains in pathology images based on the vision transformer (ViT) architecture. StainNet contains a ViT-Small and a ViT-Base model, both of which are trained using a self-distillation SSL approach on over 1.4 million patch images extracted from 20,231 publicly available IHC and special staining WSIs in the HISTAI database. To evaluate StainNet models, we conduct experiments on three in-house slide-level IHC classification tasks, three in-house ROI-level special stain classification tasks, and two public ROI-level IHC classification tasks to demonstrate their strong capabilities. We also perform ablation studies such as few-ratio learning and retrieval evaluations, and compare StainNet models with recent larger PFMs to further highlight their strengths.}
}



@InProceedings{pmlr-v315-lin26a,
  title = 	 {OmniNet: A Multi-Modality Neural Network for Robust Remote Respiratory Rate Measurement from Facial Video},
  author =       {Lin, Tsai-Ni and Liu, An-Sheng and Fu, Li-Chen},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {570--593},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/lin26a/lin26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/lin26a.html},
  abstract = 	 {Remote respiratory rate (RR) measurement has gained traction in recent studies due to its ability to reduce healthcare professionals’ workload and patient discomfort. Recent work has targeted this problem through remote photoplethysmography (rPPG) to capture subtle facial color changes. However, this technique is sensitive to lighting and motion variations. To this end, we propose OmniNet, a multimodal neural network that integrates image data processed through 3D convolutional neural networks (3D CNNs) with point of interest (POI) motion data and passes the fused features to a Bidirectional Long Short-Term Memory (BiLSTM) network to model long-term temporal dependencies. OmniNet achieves state-of-the-art performance by effectively capturing comprehensive spatial and temporal information while reducing illumination variation and motion-induced artifacts. It also requires fewer computational resources and enables faster inference compared to Transformer networks.}
}



@InProceedings{pmlr-v315-chen26b,
  title = 	 {Vector Quantization for Reversed Disease Progression: Further Investigations},
  author =       {Chen, Chih-Chieh and Kuo, Chang-Fu},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {594--619},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/chen26b/chen26b.pdf},
  url = 	 {https://proceedings.mlr.press/v315/chen26b.html},
  abstract = 	 {Interpretability plays a pivotal role in the collaboration between artificial intelligence (AI) systems and clinicians. It enables clinicians to critically reassess the rationale underlying AI-generated predictions. Moreover, translating these interpretations into clinically meaningful quantifications is feasible even for more granular algorithms, thereby potentially reducing the extensive annotation efforts typically required. Recently, a novel approach was introduced to generate reversed disease progression trajectories by applying conditional flow matching within the latent space of an autoencoder, jointly training a linear classifier. However, the architectural design, training procedures, and objective functions associated with the flow matching network warrant further investigation and refinement. In the present study, we implement this concept utilizing a recently proposed vector-quantized autoencoder framework incorporating Sinkhorn-based quantization. Our findings indicate that reversed disease progression can be consistently generated even in the absence of joint classifier training. Additionally, the method preserves strong spatial correspondences between the pixel domain and latent representations, enabling the synthesis of desired images through a CutMix-inspired algorithm. We demonstrate the efficacy of our approach by applying it to the weakly supervised quantization of midline shift distances.}
}



@InProceedings{pmlr-v315-xiao26a,
  title = 	 {Topological-Aware Regularization for Semi-Supervised Intracranial Aneurysm Vessel Segmentation},
  author =       {Xiao, Feiyang and Zhang, Yichi and Li, Xigui and Zhou, Yuanye and Jiang, Chen and Guo, Xin and Han, Limei and Li, Yuxin and Zhu, Fengping and Cheng, Yuan},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {620--636},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/xiao26a/xiao26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/xiao26a.html},
  abstract = 	 {Accurate segmentation of intracranial aneurysms and their parent vessels (IA-Vessel) from magnetic resonance angiography is a critical prerequisite for computational fluid dynamics-based rupture risk assessment. While deep learning methods can automate this laborious task, they are hindered by the high cost and scarcity of expert annotations. Most existing semi-supervised methods focus on enforcing regional constraints while largely ignoring topological constraints, making them insensitive to subtle but critical errors such as vessel adhesion or surface irregularities, which often render the results unsuitable for downstream applications. To address this gap, we introduce topological-aware regularization (TAR), which incorporates the learning of local vascular topology to ensure precise and geometrically correct segmentation of the IA-Vessel complex using only a small amount of labeled data. Experimental results on a multi-center MRA dataset show that our framework efficiently utilizes unlabeled data and outperforms state-of-the-art semi-supervised segmentation methods. Instead of being restricted to a fixed framework, TAR is a plug-and-play strategy that can be seamlessly integrated into various semi-supervised frameworks to further boost their performance.}
}



@InProceedings{pmlr-v315-golbaf26a,
  title = 	 {UM–ProtoShare: UNet-Guided, Multi-scale Shared Prototypes for Interpretable Brain Tumour Classification Using Multi-sequence 3D MRI},
  author =       {Golbaf, Ali and Singh, Vivek and Gaudl, Swen and Ifeachor, Emmanuel},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {637--669},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/golbaf26a/golbaf26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/golbaf26a.html},
  abstract = 	 {Deep learning shows strong promise in brain tumour classification using Magnetic Resonance Imaging (MRI), although limited interpretability constrains clinical translation. Most interpretability methods are post-hoc and yield visual attribution maps that are only weakly connected to the decision process. Clinicians prefer decisions built from evidence they can recognise and verify on MRI, rather than post-hoc explanations. Case-based models embed reasoning by comparing image evidence with learned prototypes, yielding “this looks like that” rationales at decision time and mirroring clinical reasoning. Building on this paradigm, we introduce UM–ProtoShare, which compares the input multi-sequence 3D brain MRI with a bank of shared, class-agnostic, multi-scale prototypes for pre-operative glioma grading. It returns not only a label, but a set of prototype matches that highlight where the model found support for its prediction. UM–ProtoShare uses a 3D ResNet-152 encoder, a lightweight UNet–style decoder with gated encoder–decoder fusions, and a normalised soft-masked mapping module to align and highlight prototype evidence on MRI. On BraTS-2020, ablations show additive benefits from the normalised mapping module, prototype sharing, multi-scale prototypes, and the decoder with gated fusions. Varying the allocation of prototypes across scales identifies a balanced accuracy–interpretability configuration that closely approaches a strong 3D ResNet-152 in classification performance (Balanced Accuracy: 88.40 $\pm$ 2.80; 1.48 percentage points lower) while delivering more faithful and spatially precise evidence than prior case-based models, with Activation Precision (AP) 88.72 $\pm$ 1.60 ($+$11.0% vs MProtoNet; $+$4.0% vs MAProtoNet) and Incremental Deletion Score (IDS) 5.10 $\pm$ 1.30 (lower is better, $-$32.3% vs MProtoNet, $-$25.3% vs MAProtoNet).}
}



@InProceedings{pmlr-v315-dai26a,
  title = 	 {Quantitative Pose-Based Analysis of Movement Disorders in Pediatric NGLY1 and SLC13A5 Patients},
  author =       {Dai, Chengliang and Scordis, Phil and Teeyagura, Prathyusha and Solidum, Rayann M. and Broderick, Jeff and Broderick, Julia and Broderick, Jane and Porter, Brenda E.},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {670--684},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/dai26a/dai26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/dai26a.html},
  abstract = 	 {Diagnosis and monitoring of movement disorders have long relied on subjective clinical observation. By contrast, computer vision tools such as OpenPose can turn video recordings into precise, time-resolved measurements of a patient’s posture and movement. In this work, we apply a fully markerless, pose-based pipeline to classify abnormal movements in children with NGLY1 or SLC13A5 mutations. Our primary focus is on simple, physician-informed pose features that can be interpreted in clinical terms and used with conventional classifiers (Random Forest, SVM, etc.) on a very small dataset. We show that these handcrafted features capture clinically meaningful differences between movement-disorder phenotypes and can achieve useful classification performance. In addition, we include an exploratory comparison with a transformer model that is pre-trained on large-scale action-recognition data and then fine-tuned on our pose data. This experiment illustrates the potential performance ceiling of deep learning with extensive pretraining, but we emphasize that such models are less transparent and more data-hungry than the traditional approaches that form the core contribution of this study.}
}



@InProceedings{pmlr-v315-mgboh26a,
  title = 	 {FluenceFormer: Transformer-Driven Multi-Beam Fluence Map Regression for Radiotherapy Planning},
  author =       {Mgboh, Ujunwa and Sultan, Rafi Ibn and Kim, Joshua and Thind, Kundan and Zhu, Dongxiao},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {685--700},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/mgboh26a/mgboh26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/mgboh26a.html},
  abstract = 	 {Fluence map prediction is central to automated radiotherapy planning but remains an ill-posed inverse problem due to the complex relationship between volumetric anatomy and beam-intensity modulation. Convolutional methods in prior work often struggle to capture long-range dependencies, which can lead to structurally inconsistent or physically unrealizable plans. We introduce FluenceFormer, a backbone-agnostic transformer framework for direct, geometry-aware fluence regression. The model uses a unified two-stage design: Stage 1 predicts a global dose prior from anatomical inputs, and Stage 2 conditions this prior on explicit beam geometry to regress physically calibrated fluence maps. Central to the approach is the Fluence-Aware Regression (FAR) loss, a physics-informed objective that integrates voxel-level fidelity, gradient smoothness, structural consistency, and beam-wise energy conservation. We evaluate the generality of the framework across multiple transformer backbones, including Swin UNETR, UNETR, nnFormer, and MedFormer, using a prostate IMRT dataset. FluenceFormer with Swin UNETR achieves the strongest performance among the evaluated models and improves over existing benchmark CNN and single-stage methods, reducing Energy Error to $\mathbf{4.5%}$ and yielding statistically significant gains in structural fidelity ($p < 0.05$).}
}



@InProceedings{pmlr-v315-li26d,
  title = 	 {Synthetic Vasculature and Pathology Enhance Vision-Language Model Reasoning},
  author =       {Li, Chenjun and Wan, Cheng and Lux, Laurin and Berger, Alexander H. and Rosen, Richard B. and Menten, Martin J. and Paetzold, Johannes C.},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {701--725},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/li26d/li26d.pdf},
  url = 	 {https://proceedings.mlr.press/v315/li26d.html},
  abstract = 	 {Vision-language models (VLMs) offer a promising path toward interpretable medical diagnosis by allowing users to ask about clinical explanations alongside predictions and across different modalities. However, training VLMs for detailed reasoning requires large-scale image-text datasets. In many specialized domains, for example in reading optical coherence tomography angiography (OCTA) images, such precise text with grounded descriptions of pathologies is scarce or even non-existent. To overcome this bottleneck, we introduce synthetic vasculature reasoning (SVR), a framework that controllably synthesizes images and corresponding text: realistic retinal vasculature with diabetic retinopathy (DR) features (capillary dropout, microaneurysms, intraretinal microvascular abnormalities, and tortuosity), paired with automatically generated granular reasoning texts. Based on this, we curate OCTA-100K-SVR, an OCTA image-reasoning dataset with 100,000 pairs. Our experiments show that a general-purpose VLM (Qwen3-VL-8b) trained on the dataset achieves a zero-shot balanced classification accuracy of 86.69% on real OCTA images, demonstrating performance comparable to supervised baselines. Through human expert evaluation we also demonstrate that it significantly enhances explanation quality and pathology localization on clinical data.}
}



@InProceedings{pmlr-v315-veenboer26a,
  title = 	 {TAP-CT: 3D Task-Agnostic Pretraining of Computed Tomography Foundation Models},
  author =       {Veenboer, Tim and Yiasemis, George and Marcus, Eric and van Veldhuizen, Vivien and Snoek, Cees G. M. and Teuwen, Jonas and Groot Lipman, Kevin B. W.},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {726--753},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/veenboer26a/veenboer26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/veenboer26a.html},
  abstract = 	 {Existing foundation models (FMs) in the medical domain often require extensive fine-tuning or rely on training resource-intensive decoders, while many existing encoders are pretrained with objectives biased toward specific tasks. This illustrates a need for a strong, task-agnostic foundation model that requires minimal fine-tuning beyond feature extraction. In this work, we introduce TAP-CT, a suite of task-agnostically pretrained CT foundation models: a simple yet effective adaptation of Vision Transformers (ViTs) and DINOv2 for volumetric data, enabling scalable self-supervised pretraining directly on 3D CT volumes. Our approach incorporates targeted modifications to patch embeddings, positional encodings, and volumetric augmentations, making the models depth-aware while preserving the simplicity of the underlying architectures. We show that large-scale 3D pretraining on an extensive in-house CT dataset (105K volumes) yields stable, robust frozen representations that generalize strongly across downstream tasks. To promote transparency and reproducibility, and to establish a powerful, low-resource baseline for future research in medical imaging, we will release all pretrained models, experimental configurations, and downstream benchmark code at .}
}



@InProceedings{pmlr-v315-moriakov26a,
  title = 	 {Conditional Learned Reconstruction for Medical Imaging},
  author =       {Moriakov, Nikita and Yiasemis, George and Sonke, Jan-Jakob and Teuwen, Jonas},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {754--780},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/moriakov26a/moriakov26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/moriakov26a.html},
  abstract = 	 {Medical imaging utilizes a handful of different imaging modalities, such as tomography and magnetic resonance imaging (MRI), that require solving an inverse problem to reconstruct an image from the acquired measurements. Reconstruction methods based on learned iterative schemes have been widely explored recently; however, these modalities involve variability in hardware- and protocol-dependent acquisition parameters, such as tube current and projection count in the case of tomography and acceleration factor or field strength in the case of MRI, which are typically not accounted for in the architecture. In this work we propose the framework of conditional learned iterative schemes, where the network weights are explicitly adapted as learned functions of the acquisition parameters. We compare conditional learned iterative schemes to their counterparts without conditioning for both tomography and MRI and demonstrate their effectiveness.}
}



@InProceedings{pmlr-v315-weidner26a,
  title = 	 {Brain Tumor Growth Inversion via Differentiable Neural Surrogates},
  author =       {Weidner, Jonas and Zimmer, Lucas and Ezhov, Ivan and Balcerak, Michal and Menze, Bj{\"o}rn and R{\"u}ckert, Daniel and Wiestler, Benedikt},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {781--800},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/weidner26a/weidner26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/weidner26a.html},
  abstract = 	 {Personalizing biophysical brain tumor models to individual patients is computationally expensive due to the need for numerous iterative evaluations of slow numerical solvers to identify optimal patient-specific parameters. We address this by introducing a differentiable neural surrogate that replaces the traditional forward model. Unlike the original solver, this surrogate is fully differentiable, allowing us to solve the inverse problem using highly efficient gradient-based optimization. This approach ensures that the solution learns the biophysical constraints of tumor growth while accelerating the process by orders of magnitude. In a 3D brain tumor growth setting, our framework achieves accuracy competitive with classical optimization while reducing runtime from days to seconds. Crucially, we demonstrate that our method, though trained on synthetic data, generalizes effectively to real patient scans. These findings establish differentiable surrogates as a powerful tool for accelerating scientific machine learning in medical imaging and beyond.}
}



@InProceedings{pmlr-v315-charisiadis26a,
  title = 	 {ICL-NoiseUNet - A Novel In-Context Learning Based Framework For Ultrasound Segmentation With Adaptive Noise Modulation},
  author =       {Charisiadis, Ioannis and el Allali, Ilyass and Lopata, Richard G. P. and S{\'a}nchez, Clara I. and Awasthi, Navchetan},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {801--826},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/charisiadis26a/charisiadis26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/charisiadis26a.html},
  abstract = 	 {The complex patterns, artifacts and speckle noise that are present in ultrasound images make precise segmentation very challenging. Existing approaches, such as convolutional neural network architectures and foundation models, have shown promising results across a wide range of tasks. However, they struggle to adapt to the unique characteristics of ultrasound data, leading to poor delineation of anatomical boundaries. For that reason, we propose ICL-NoiseUNet, an in-context-learning segmentation framework that combines guidance from a set of input-output pairs, called the context set, with analytic noise descriptors. More specifically, the model leverages an In-Context Feature Conditioning (ICFC) module to incorporate context examples and a Noise Modulation Block (NMB) that adapts feature representations to ultrasound characteristics. After extensive evaluation across several datasets, ICL-NoiseUNet consistently outperforms state-of-the-art methods, enhancing the segmentation quality. Moreover, ablation studies confirm the synergistic effect of contextual conditioning and noise modulation. Overall, these findings pave the way for noise-guided ultrasound segmentation. The code will be open-source at .}
}



@InProceedings{pmlr-v315-sonmezer26a,
  title = 	 {WristMIR: Coarse-to-Fine Region-Aware Retrieval of Pediatric Wrist Radiographs with Radiology Report-Driven Learning},
  author =       {Sonmezer, Mert and Vasylechko, Serge and Atasoy, Duygu and Ertekin, Seyda and Kurugol, Sila},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {827--846},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/sonmezer26a/sonmezer26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/sonmezer26a.html},
  abstract = 	 {Retrieving wrist radiographs with analogous fracture patterns is challenging because clinically important cues are subtle, highly localized and often obscured by overlapping anatomy or variable imaging views. Progress is further limited by the scarcity of large, well-annotated datasets for case-based medical image retrieval. We introduce WristMIR, a region-aware pediatric wrist radiograph retrieval framework that leverages dense radiology reports and bone-specific localization to learn fine-grained, clinically meaningful image representations without any manual image-level annotations. Using MedGemma-based structured report mining to generate both global and region-level captions, together with pre-processed wrist images and bone-specific crops of the distal radius, distal ulna, and ulnar styloid, WristMIR jointly trains global and local contrastive encoders and performs a two-stage retrieval process: (1) coarse global matching to identify candidate exams, followed by (2) region-conditioned reranking aligned to a predefined anatomical bone region. WristMIR improves retrieval performance over strong vision-language baselines, raising image-to-text Recall@5 from 0.82% to 9.35%. Its embeddings also yield stronger fracture classification (AUROC 0.949, AUPRC 0.953). In region-aware evaluation, the two-stage design markedly improves retrieval-based fracture diagnosis, increasing mean $F_1$ from 0.568 to 0.753, and radiologists rate its retrieved cases as more clinically relevant, with mean scores rising from 3.36 to 4.35. These findings highlight the potential of anatomically guided retrieval to enhance diagnostic reasoning and support clinical decision-making in pediatric musculoskeletal imaging.}
}



@InProceedings{pmlr-v315-mao26a,
  title = 	 {RandP: Effective and Efficient Medical Visual In-Context Learning via a Retrieve-and-Propagate Module for Prompt-Query Fusion},
  author =       {Mao, Rongge and Li, Han and Dong, Chengqi and Navab, Nassir and Zhou, S Kevin},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {847--867},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/mao26a/mao26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/mao26a.html},
  abstract = 	 {Visual In-Context Learning (ICL) has emerged as a promising paradigm for constructing vision generalists by conditioning on prompt pairs. Existing visual ICL methods typically adopt a grid-like prompt-query construction combined with Masked Image Modeling (MIM) as the training strategy. However, directly applying these frameworks to medical imaging tasks often leads to suboptimal performance. Moreover, the reliance on MIM restricts the backbone to Vision Transformer (ViT) and introduces unnecessary computational overhead due to the need to reconstruct the prompt label. In this work, we revisit previous visual ICL paradigms for medical imaging and propose a training-inference aligned masking strategy to replace MIM. We further introduce a Retrieve-and-Propagate (RandP) module to enhance prompt-query fusion under this masking scheme. Experimental results show that our RandP visual ICL framework not only doubles the inference speed compared to prior visual ICL baselines but also achieves superior performance across multiple medical imaging tasks. Furthermore, unlike previous approaches constrained to vanilla ViT, our framework is compatible with U-Net-style architectures, enabling broader applicability and improved effectiveness in the medical imaging domain. Our code will be available.}
}



@InProceedings{pmlr-v315-srivastava26a,
  title = 	 {CWCD: Category-Wise Contrastive Decoding for Structured Medical Report Generation},
  author =       {Srivastava, Shantam and Bhosale, Mahesh and Doermann, David and Gao, Mingchen},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {868--893},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/srivastava26a/srivastava26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/srivastava26a.html},
  abstract = 	 {Interpreting chest X-rays is inherently challenging due to the overlap between anatomical structures and the subtle presentation of many clinically significant pathologies, making accurate diagnosis time-consuming even for experienced radiologists. Recent radiology-focused foundation models, such as LLaVA-Rad and Maira-2, have positioned multi-modal large language models (MLLMs) at the forefront of automated radiology report generation (RRG). However, despite these advances, current foundation models generate reports in a single forward pass. This decoding strategy diminishes attention to visual tokens and increases reliance on language priors as generation proceeds, which in turn introduces spurious pathology co-occurrences in the generated reports. To mitigate these limitations, we propose Category-Wise Contrastive Decoding (CWCD), a novel and modular framework designed to enhance structured radiology report generation (SRRG). Our approach introduces category-specific parameterization and generates category-wise reports by contrasting normal X-rays with masked X-rays using category-specific visual prompts. Experimental results demonstrate that CWCD consistently outperforms baseline methods across both clinical efficacy and natural language generation metrics. An ablation study further elucidates the contribution of each architectural component to overall performance.}
}



@InProceedings{pmlr-v315-ertl26a,
  title = 	 {nnLandmark: A Self-Configuring Method for 3D Medical Landmark Detection},
  author =       {Ertl, Alexandra and Denner, Stefan and Peretzke, Robin and Xiao, Shuhan and Zimmerer, David and Fischer, Maximilian and Bujotzek, Markus and Yang, Xin and Neher, Peter and Isensee, Fabian and Maier-Hein, Klaus H.},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {894--927},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/ertl26a/ertl26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/ertl26a.html},
  abstract = 	 {Landmark detection is central to many medical applications, such as identifying critical structures for treatment planning or defining control points for biometric measurements. However, manual annotation is labor-intensive and requires expert anatomical knowledge. While deep learning shows promise in automating this task, fair evaluation and interpretation of methods in a broader context are hindered by limited public benchmarking, inconsistent baseline implementations, and non-standardized experimentation. To overcome these pitfalls, we present nnLandmark, a self-configuring framework for 3D landmark detection that combines tailored heatmap generation, loss design, inference logic, and a robust set of hyperparameters for heatmap regression, while reusing components from nnU-Net’s underlying self-configuration and training engine. nnLandmark achieves state-of-the-art performance across five public and one private dataset, benchmarked against three recently published methods. Its out-of-the-box usability enables training strong landmark detection models on new datasets without expert knowledge or dataset-specific hyperparameter tuning. Beyond accuracy, nnLandmark provides both a strong, common baseline and a flexible, standardized environment for developing and evaluating new methodological contributions. It further streamlines evaluation across multiple datasets by offering data conversion utilities for current public benchmarks. Together, these properties position nnLandmark as a central tool for advancing 3D medical landmark detection through systematic, transparent benchmarking, making it possible to genuinely measure methodological progress.}
}



@InProceedings{pmlr-v315-duelmer26a,
  title = 	 {UltraG-Ray: Physics-Based Gaussian Ray Casting for Novel Ultrasound View Synthesis},
  author =       {Duelmer, Felix and Klaushofer, Jakob and Wysocki, Magdalena and Navab, Nassir and Azampour, Mohammad Farid},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {928--946},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/duelmer26a/duelmer26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/duelmer26a.html},
  abstract = 	 {Novel view synthesis (NVS) in ultrasound has gained attention as a technique for generating anatomically plausible views beyond the acquired frames, offering new capabilities for training clinicians or data augmentation. However, current methods struggle with complex tissue and view-dependent acoustic effects. Physics-based NVS aims to address these limitations by including the ultrasound image formation process in the simulation. Recent approaches combine a learnable implicit scene representation with an ultrasound-specific rendering module, yet a substantial gap between simulation and reality remains. In this work, we introduce UltraG-Ray, a novel ultrasound scene representation based on a learnable 3D Gaussian field, coupled to an efficient physics-based module for B-mode synthesis. We explicitly encode ultrasound-specific parameters, such as attenuation and reflection, into a Gaussian-based spatial representation and realize image synthesis within a novel ray casting scheme. In contrast to previous methods, this approach naturally captures view-dependent attenuation effects, thereby enabling the generation of physically informed B-mode images with increased realism. We compare our method to state-of-the-art approaches and observe consistent gains in image quality metrics (up to a 15% increase in MS-SSIM), demonstrating a clear improvement in the realism of the synthesized ultrasound images.}
}



@InProceedings{pmlr-v315-paranjape26a,
  title = 	 {CatVLM: Enhancing Temporal Understanding in Cataract Surgery Videos with Boundary-Aware VLM},
  author =       {Paranjape, Jay N. and Shah, Nisarg and Narayanan, Nanthini and Sikder, Shameema and Vedula, S. Swaroop and Patel, Vishal M.},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {947--959},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/paranjape26a/paranjape26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/paranjape26a.html},
  abstract = 	 {Recent studies have shown the effectiveness of Vision Language Models (VLMs) for understanding and analyzing videos in the medical domain and supporting various Question-Answer (QA) tasks. Yet, current VLMs fall short in addressing queries that require temporal reasoning, a critical capability for surgical video understanding. In this work, we introduce CatVLM, a boundary-aware VLM designed to capture temporal dynamics in untrimmed cataract surgery videos. CatVLM is capable of performing three clinically relevant tasks that demand moment-level awareness: Video Moment Retrieval (VMR), Video Captioning (VC), and Counting. To facilitate the training of such a model, we generate a bank of QA annotations for each task and propose a method to integrate video clips with the timestamps at which they occur. To the best of our knowledge, this work is one of the first approaches to explicitly incorporate temporal boundary awareness into VLMs for cataract surgery as well as the broader medical domain. We evaluate CatVLM on two public cataract surgery datasets, establishing new baselines across all three tasks. All the code, model checkpoints, and annotations will be released post-review.}
}



@InProceedings{pmlr-v315-atad26a,
  title = 	 {Hide-and-Seek Attribution: Weakly Supervised Segmentation of Vertebral Metastases in CT},
  author =       {Atad, Matan and Marka, Alexander W. and Steinhelfer, Lisa and Curto-Vilalta, Anna and Leonhardt, Yannik and Foreman, Sarah C. and Dietrich, Anna-Sophia Walburga and Graf, Robert and Gersing, Alexandra S. and Menze, Bjoern and Rueckert, Daniel and Kirschke, Jan S. and Moeller, Hendrik},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {960--988},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/atad26a/atad26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/atad26a.html},
  abstract = 	 {Accurate segmentation of vertebral metastasis in CT is clinically important yet difficult to scale, as voxel-level annotations are scarce and both lytic and blastic lesions often resemble benign degenerative changes. We introduce a 2D weakly supervised method trained solely on vertebra-level healthy/malignant labels, without any lesion masks. The method combines a Diffusion Autoencoder (DAE) that produces a classifier-guided healthy edit of each vertebra with pixel-wise difference maps that propose suspect candidate lesions. To determine which regions truly reflect malignancy, we introduce Hide-and-Seek Attribution: each candidate is revealed in turn while all others are hidden, the edited image is projected back to the data manifold by the DAE, and a latent-space classifier quantifies the isolated malignant contribution of that component. High-scoring regions form the final lytic or blastic segmentation. On held-out radiologist annotations, we achieve strong blastic/lytic performance despite no mask supervision (F1: 0.91/0.85; Dice: 0.87/0.78), exceeding baselines (F1: 0.79/0.67; Dice: 0.74/0.55). These results show that vertebra-level labels can be transformed into reliable lesion masks, demonstrating that generative editing combined with selective occlusion supports accurate weakly supervised segmentation in CT.}
}



@InProceedings{pmlr-v315-shi26a,
  title = 	 {SGDC: Structurally-Guided Dynamic Convolution for Medical Image Segmentation},
  author =       {Shi, Bo and Zhu, Wei-ping and Swamy, M.N.S},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {989--1003},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/shi26a/shi26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/shi26a.html},
  abstract = 	 {Spatially variant dynamic convolution provides a principled approach to integrating spatial adaptivity into deep neural networks. However, mainstream designs in medical segmentation commonly generate dynamic kernels through average pooling, which implicitly collapses high-frequency spatial details into a coarse, spatially-compressed representation, leading to over-smoothed predictions that degrade the fidelity of fine-grained clinical structures. To address this limitation, we propose a novel Structure-Guided Dynamic Convolution (SGDC) mechanism, which leverages an explicitly supervised structure-extraction branch to guide the generation of dynamic kernels and gating signals for structure-aware feature modulation. Specifically, the high-fidelity boundary information from this auxiliary branch is fused with semantic features to enable spatially precise feature modulation. By replacing context aggregation with pixel-wise structural guidance, the proposed design effectively prevents the information loss introduced by average pooling. Experimental results show that SGDC achieves state-of-the-art performance on the ISIC 2016, PH2, ISIC 2018, and CoNIC datasets, delivering superior boundary fidelity by reducing the Hausdorff Distance (HD95) by 2.05 and providing consistent IoU gains of 0.99%–1.49% over pooling-based baselines. Moreover, the mechanism exhibits strong potential for extension to other fine-grained, structure-sensitive vision tasks, such as small-object detection, offering a principled solution for preserving structural integrity in medical image analysis.}
}



@InProceedings{pmlr-v315-wu26b,
  title = 	 {Guideline-Informed MLLM Reasoning for Pathology-Aware Postoperative Prostate CTV Segmentation},
  author =       {Wu, Yinhao and Zhao, Hengrui and Li, Haiqing and Zhong, Wenliang and Ma, Hehuan and Guo, Yuzhi and Nguyen, Dan and Yang, Daniel and Jiang, Steve and Huang, Junzhou},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1004--1028},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/wu26b/wu26b.pdf},
  url = 	 {https://proceedings.mlr.press/v315/wu26b.html},
  abstract = 	 {Accurate segmentation of the Clinical Target Volume (CTV) is a critical prerequisite for precise radiotherapy planning, pursuing complete irradiation of microscopic disease while minimizing toxicity to surrounding healthy organs. However, achieving automated CTV segmentation remains highly challenging due to the invisible microscopic disease on planning CT and the necessity of incorporating clinical context into delineation decisions. Unlike previous methods that rely solely on visual features or coarse global text reasoning, we propose ReaCT, a unified framework that reformulates CTV segmentation as a multimodal reasoning task by explicitly integrating pathological information with visual context. Specifically, we introduce a Guideline-Informed Attribute Extractor that follows the information-retrieval workflow of radiation oncologists. By distilling knowledge from clinical guidelines, this module filters and structures lengthy pathology reports into a concise set of clinically determinative pathological attributes, effectively bridging the semantic gap between unstructured clinical records and segmentation networks. Furthermore, we develop an Attribute-Specific MLLM Reasoner built upon a 3D residual U-Net that performs fine-grained spatial reasoning. By leveraging a sequence of attribute-specific query tokens, the model disentangles the distinct target implications of individual pathological attributes, enabling fine-grained anatomical alignment via multi-scale fusion using Two-Way Transformers. Experiments on a postoperative prostate cancer dataset demonstrate that ReaCT achieves state-of-the-art segmentation performance and exhibits strong robustness, with pronounced improvements under limited-annotation settings.}
}



@InProceedings{pmlr-v315-wang26d,
  title = 	 {Uncertainty-aware Cycle Diffusion Model for Fair Glaucoma Diagnosis},
  author =       {Wang, Ziheng and Yang, Shuran and Lin, Yan and Zang, Wenrui and Meng, Yanda},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1029--1043},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/wang26d/wang26d.pdf},
  url = 	 {https://proceedings.mlr.press/v315/wang26d.html},
  abstract = 	 {Fairness has become a critical ethical concern, particularly in AI-based healthcare applications. Data imbalance and limited sample size can lead to lower diagnostic performance. Consequently, this harms the fairness of AI when applied to real-world scenarios. Generative models, like diffusion models, offer a promising solution by generating diverse synthetic data to support underrepresented groups. This improves fairness and performance while mitigating privacy risks. We propose a shape-controlled framework that incorporates demographic information into an end-to-end diffusion model, along with an automatic selection strategy to identify overconfidently misclassified samples. These challenging samples are then augmented via the generative model to enhance its classification performance. The strategy also removes potentially misleading “lower-quality” synthetic samples. Two ophthalmic experts validated the clinical relevance and plausibility of our synthetic images through random external examination. Our method outperforms state-of-the-art methods on the Harvard-FairVLMed dataset in both fairness and diagnosis accuracy.}
}



@InProceedings{pmlr-v315-melacini26a,
  title = 	 {Flow Matching for 3D Craniofacial Skeletal Data Generation},
  author =       {Melacini, Giacomo and Mazzocchetti, Stefano and Lisanti, Giuseppe and Di Stefano, Luigi and Salti, Samuele},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1044--1064},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/melacini26a/melacini26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/melacini26a.html},
  abstract = 	 {In the medical domain, the use of Machine Learning (ML) techniques for diagnosis, treatment planning, and medical imaging interpretation is becoming increasingly important. However, these approaches require a large amount of data, which is challenging to access due to its sensitive nature and related privacy concerns. Synthetic data generation, enabled by advances in generative techniques, provides a solution to create large anonymized datasets for training models without compromising patient privacy. Recently, Flow Matching with Optimal Transport (OTFM) has proven to be an effective technique for generating realistic 2D natural images, surpassing existing methods, but its usage for 3D medical data generation is limited. In this work we generate craniofacial skeletal data using OTFM and test the validity of the results in two clinical downstream tasks: skull alignment and shape completion. Moreover, we compare the quality of synthetic data generated with OTFM against that of data generated using Denoising Diffusion Probabilistic Models (DDPMs). We show that Flow Matching with Optimal Transport is an effective technique for generating synthetic data and that, in this context, it outperforms DDPMs in both quality and robustness.}
}



@InProceedings{pmlr-v315-jong26a,
  title = 	 {Towards Effective Surgical Representation Learning with DINO Models},
  author =       {de Jong, Ronald L.P.D. and Li, Yiping and Jaspers, Tim J.M. and van Jaarsveld, Romy C. and Kuiper, Gino M. and Badaloni, Franco and van Hillegersberg, Richard and Ruurda, Jelle P. and van der Sommen, Fons and Pluim, Josien P.W. and Breeuwer, Marcel},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1065--1080},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/jong26a/jong26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/jong26a.html},
  abstract = 	 {Self-supervised learning (SSL) has emerged as a promising approach to address the limitations of annotated surgical datasets, which are often small, heterogeneous, and expensive to curate. Among SSL methods, self-distillation with no labels (DINO) has achieved state-of-the-art (SOTA) results in natural images, but its applicability to surgical data remains underexplored. In this work, we systematically investigate DINOv1, DINOv2, and DINOv3 for surgical representation learning. We pretrain these models on a large-scale surgical dataset of 4.7M video frames (SurgeNetXL) and evaluate their transferability on downstream tasks including semantic segmentation and surgical phase recognition. Our results demonstrate that in-domain pretraining consistently improves performance across all DINO variants, with DINOv2 and DINOv3 achieving SOTA performance. We further offer practical insights and visualizations highlighting the effectiveness of SSL. Finally, our study delivers ready-to-use DINO-based SSL models and pretraining protocols for surgical computer vision research, which are publicly available at: github.com/rlpddejong/SurgeNetDINO.}
}



@InProceedings{pmlr-v315-pinetz26a,
  title = 	 {Exploiting Intermediate Reconstructions in Optical Coherence Tomography for Test-Time Adaptation of Medical Image Segmentation},
  author =       {Pinetz, Thomas and Hucke, Veit and Bogunovi\'c, Hrvoje},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1081--1094},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/pinetz26a/pinetz26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/pinetz26a.html},
  abstract = 	 {Primary health care frequently relies on low-cost imaging devices, which are commonly used for screening purposes. To ensure accurate diagnosis, these systems depend on advanced reconstruction algorithms designed to approximate the performance of high-quality counterparts. Such algorithms typically employ iterative reconstruction methods that incorporate domain-specific prior knowledge. However, downstream task performance is generally assessed using only the final reconstructed image, thereby disregarding the informative intermediate representations generated throughout the reconstruction process. In this work, we propose IRTTA to exploit these intermediate representations at test time by adapting the normalization-layer parameters of a frozen downstream network via a modulator network that conditions on the current reconstruction timestep. The modulator network is learned during test time using an averaged entropy loss across all individual timesteps. Variation among the timestep-wise segmentations additionally provides uncertainty estimates at no extra cost. This approach enhances segmentation performance and enables semantically meaningful uncertainty estimation, all without modifying either the reconstruction process or the downstream model.}
}



@InProceedings{pmlr-v315-wei26a,
  title = 	 {MID-POSE: Multi-Instrument Detection and Pose Estimation in Endoscopic Surgery},
  author =       {Wei, Wenhua and Mennillo, Laurent and Mao, Zhehua and Wijekoon, Anjana and Feeny, Kendall and Khan, Danyal Zaman and Mazomenos, Evangelos B. and Stoyanov, Danail and Marcus, Hani J. and Bano, Sophia},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1095--1114},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/wei26a/wei26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/wei26a.html},
  abstract = 	 {Reliable perception of surgical instruments is a key prerequisite for intraoperative guidance, context-aware assistance, and workflow analysis in minimally invasive surgery (MIS). This is particularly challenging in skull base procedures, where narrow anatomical corridors, frequent occlusions, specular highlights, and visually similar instruments make multi-class detection and 2D pose estimation difficult. We address joint instrument detection and keypoint-based pose estimation from monocular endoscopic videos and introduce MID-POSE, a dual-head architecture that couples a high-resolution HRNetV2p encoder with a class-agnostic dense detection-pose head and a Multi-level Instrument Classification (MIC) head which operates on RoI-aligned multi-level features. To support this task, we construct the PitSurg dataset from 26 clinical procedures, providing seven instrument classes with bounding boxes and detailed 2D keypoints. Using YOLOv8x-pose as our strongest baseline, which in our tasks outperforms YOLO11x-pose, MID-POSE improves Det/Pose $AP_{50\text{–}95}$ on PitSurg from $59.4/63.1$ to $77.5/78.5$ and on the robotic SurgPose dataset from $47.9/61.1$ to $62.7/71.4$. Qualitative analysis shows that high-resolution features sharpen localisation and keypoint placement, while the RoI classifier reduces misclassifications and spurious background detections, indicating that the proposed architecture and dataset provide an effective basis for robust multi-instrument perception in MIS.}
}



@InProceedings{pmlr-v315-moustafa26a,
  title = 	 {Generalizing Abstention for Noise-Robust Learning in Medical Image Segmentation},
  author =       {Moustafa, Wesam and Elsafty, Hossam and Schneider, Helen and Sparrenberg, Lorenz and Sifa, Rafet},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1115--1136},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/moustafa26a/moustafa26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/moustafa26a.html},
  abstract = 	 {Label noise is a critical problem in medical image segmentation, often arising from the inherent difficulty of manual annotation. Models trained on noisy data are prone to overfitting, which degrades their generalization performance. While a number of methods and strategies have been proposed to mitigate noisy labels in the segmentation domain, this area remains largely under-explored. The abstention mechanism has proven effective in classification tasks by enhancing the capabilities of Cross Entropy, yet its potential in segmentation remains unverified. In this paper, we address this gap by introducing a universal and modular abstention framework capable of enhancing the noise-robustness of a diverse range of loss functions. Our framework improves upon prior work with two key components: an informed regularization term to guide abstention behaviour, and a more flexible power-law-based auto-tuning algorithm for the abstention penalty. We demonstrate the framework’s versatility by systematically integrating it with three distinct loss functions to create three novel, noise-robust variants: GAC, SAC, and ADS. Experiments on the CaDIS and DSAD medical datasets show our methods consistently and significantly outperform their non-abstaining baselines, especially under high noise levels. This work establishes that enabling models to selectively ignore corrupted samples is a powerful and generalizable strategy for building more reliable segmentation models.}
}



@InProceedings{pmlr-v315-friedetzki26a,
  title = 	 {Discriminative Self-Supervised Pre-Training for Esophagitis Detection in Upper GI Endoscopy Images},
  author =       {Friedetzki, Tobias and Chandraiah, Naveen and Svoboda, Emil and Pecina, Pavel and Puppe, Frank and Krenzer, Adrian},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1137--1152},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/friedetzki26a/friedetzki26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/friedetzki26a.html},
  abstract = 	 {Early and accurate detection of esophagitis in upper gastrointestinal endoscopy is essential for guiding targeted treatment and preventing progression to severe diseases such as esophageal cancer. Although deep learning methods have shown promise in supporting esophagitis diagnosis, their performance heavily relies on large amounts of labeled data, which are scarce. Consequently, supervised models often struggle to generalize to the high visual variability and subtle lesion differences encountered in real-world endoscopic examinations. In this work, we study discriminative self-supervised pre-training as a means of leveraging large-scale unlabeled data for robust representation learning. Multiple Vision Transformer models are pre-trained using the DINO framework on 395,201 unlabeled gastrointestinal endoscopy images and subsequently fine-tuned on a curated esophagitis dataset from three clinical centers. Our results demonstrate that self-supervised pre-training on in-domain endoscopic images significantly improves esophagitis detection performance compared to supervised pre-training on natural image datasets such as ImageNet. Specifically, in-domain DINO pre-training yields an average performance gain of 6.60 percentage points in AUPRC on the downstream detection task, with the best-performing model achieving an AUPRC of 89.82%. These findings highlight the importance of in-domain self-supervised learning for reducing annotation dependency and improving model robustness in upper GI endoscopy analysis.}
}



@InProceedings{pmlr-v315-weers26a,
  title = 	 {From Pixels to Histopathology: A Graph-Based Framework for Interpretable Whole Slide Image Analysis},
  author =       {Weers, Alexander and Berger, Alexander H. and Lux, Laurin and Sch\"uffler, Peter and Rueckert, Daniel and Paetzold, Johannes C.},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1153--1177},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/weers26a/weers26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/weers26a.html},
  abstract = 	 {The histopathological analysis of whole-slide images (WSIs) is fundamental to cancer diagnosis but is a time-consuming and expert-driven process. While deep learning methods show promising results, dominant patch-based methods artificially fragment tissue, ignore biological boundaries, and produce black-box predictions. We overcome these limitations with a novel framework that transforms gigapixel WSIs into tissue-boundary aligned graph representations and is interpretable by design. Our approach builds graph nodes from tissue regions that respect natural structures, not arbitrary grids. We introduce an adaptive graph coarsening technique, guided by learned embeddings, to efficiently merge homogeneous regions while preserving diagnostically critical details in heterogeneous areas. Each node is enriched with a compact, interpretable feature set capturing clinically-motivated priors. A graph attention network then performs diagnosis on this compact representation. We demonstrate strong performance on cancer staging and survival prediction, outperforming methods with similar data requirements. Crucially, our data-efficient model (requiring $>300\times$ less training data) achieves results competitive with a massive foundation model, while offering full interpretability through feature attribution.}
}



@InProceedings{pmlr-v315-alameddin26a,
  title = 	 {MCMA-Net++: Topology-Aware and Graph-Driven Glioma Segmentation in 3D MRI},
  author =       {Alameddin, Jihan and Thomarat, C{\'e}line and Guillevin, R{\'e}my and Fernandez-Maloigne, Christine and Guillevin, Carole},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1178--1192},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/alameddin26a/alameddin26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/alameddin26a.html},
  abstract = 	 {Glioma segmentation in 3D MRI remains challenging due to tumor heterogeneity, intensity variability, and hierarchical anatomical structure. We propose MCMA-Net++, which synergistically combines hybrid CNN-Transformer encoding, graph-based spatial reasoning with anatomical priors, and a practical multi-component topology-aware refinement loss tailored for nested tumor subregions. Our framework integrates: (1) Topology-Aware Refinement Loss (TAR-Loss), enforcing consistency across nested subregions (ET, TC, WT), and (2) Multi-Scale Anatomical Graph Reasoning (MSAGR), modeling spatial dependencies through learnable graphs with anatomical priors. Combined with dual-stream CNN-Swin Transformer encoding and Multi-Class Multi-Attention, MCMA-Net++ achieves Dice scores of 0.970$\pm$0.003 (WT), 0.943$\pm$0.005 (TC), 0.926$\pm$0.008 (ET), reducing HD95 from 5.48 mm to 3.21 mm compared to MCMA-Net. Graph reasoning contributes +1.3% Dice for ET and TAR-Loss reduces topology violations by 41%. These results demonstrate the effectiveness of combining topology-guided refinement and anatomical graph reasoning for clinical-grade glioma segmentation.}
}



@InProceedings{pmlr-v315-chang26a,
  title = 	 {A Deep Learning-Enabled Digital Twin Framework for Fast Online Adaptive Proton Therapy: A Validation Study in A Prostate SBRT Clinical Application},
  author =       {Chang, Chih-Wei and Safari, Mojtaba and Akkineni, Sri Sai and Hu, Mingzhe and Shah, Keyur D. and Patel, Pretesh and Jani, Ashesh B. and Agasthya, Greeshma and Zhou, Jun and Yang, Xiaofeng},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1193--1216},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/chang26a/chang26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/chang26a.html},
  abstract = 	 {Online adaptive radiotherapy offers substantial potential for improving treatment precision by accounting for daily anatomical variations, yet conventional replanning workflows remain time-intensive and limit feasibility for hypofractionated treatments such as prostate stereotactic body radiation therapy (SBRT). This validation study demonstrates a deep learning-enabled digital twin (DT) framework that leverages a VoxelMorph-based multi-atlas deformable image registration pipeline to enable fast online adaptive proton therapy planning with dominant intraprostatic lesion (DIL) boost while achieving clinically equivalent plan quality with significantly reduced reoptimization time. The DT framework integrates deformable registration, daily cone beam CT (CBCT)-driven anatomical updates, and knowledge-based composite scoring functions, using an institutional database of 43 prostate SBRT patients with 215 CBCT scans totaling approximately 26,312 images to forecast interfractional variations and pre-generate probabilistic treatment plans for new patients. Upon daily CBCT acquisition, the system enables rapid reoptimization using pre-computed plan conditions, and plan quality is evaluated using a ProKnow-based scoring system that assesses target coverage and organ-at-risk sparing. Across all cases, the DT framework achieved an average reoptimization time of $5.5 \pm 2.7$ minutes compared with $19.8 \pm 11.9$ minutes for clinical workflows, representing a 72 percent reduction, while producing optimal plans with a composite score of $157.2 \pm 5.6$ compared with $153.8 \pm 6.0$ for clinical plans. DT-generated plans maintained high dosimetric quality, including DIL V100 of 99.5 percent $\pm 0.6$ percent, CTV V100 of 99.8 percent $\pm 0.2$ percent, and comparable sparing of organs at risk, such as bladder V20.8Gy of $11.4 \pm 4.2$ cm$^3$, rectum V23Gy of $0.7 \pm 0.4$ cm$^3$, and urethra D10 of 90.9 percent $\pm 2.3$ percent. These results demonstrate that deep learning-enabled digital twins can substantially accelerate online adaptive proton therapy while preserving or enhancing plan quality, providing a clinically feasible pathway toward real-time personalized radiotherapy for prostate SBRT with DIL boost.}
}



@InProceedings{pmlr-v315-mishra26a,
  title = 	 {Hyperbolic U-Net for Robust Medical Image Segmentation},
  author =       {Mishra, Swasti S. and van Spengler, Max and Berkhout, Erwin and Mettes, Pascal},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1217--1251},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/mishra26a/mishra26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/mishra26a.html},
  abstract = 	 {The U-Net architecture is a leading network in medical image segmentation. Despite its strong segmentation performance, U-Net struggles when dealing with noise in image data, such as random interference and brightness variations. While a common occurrence, the presence of random noise leads to strong performance degradation in U-Net, hampering its clinical integration and robustness. In this work, we investigate the role of geometry in U-Net. All U-Net variations share the same geometric foundations, namely Euclidean geometry. Here, we propose Hyperbolic U-Net, which maintains U-Net’s proven encoder-decoder structure while operating entirely in the Poincaré ball of hyperbolic space. We identify two main roadblocks for training a fully Hyperbolic U-Net and propose a solution for each: (i) fully hyperbolic literature has so far focused on encoders, limiting their applicability to segmentation. We introduce hyperbolic 2D transpose convolution and hyperbolic bilinear upsampling layers that make it possible to create decoders, and (ii) existing hyperbolic parameter initializations are not suitable for hyperbolic decoder blocks. We introduce a Newton’s approximation-scaled weight initialization, which ensures norm preservation for all layers at the start of training. Empirically, we show that our Hyperbolic U-Nets strongly outperform standard Euclidean U-Nets across multiple medical image datasets for Gaussian, Speckle, Poisson, and Rician noise, as well as brightness and contrast shifts. We conclude that a fully Hyperbolic U-Net is highly robust to out-of-the-box noise, without the need for denoising or additional objectives, highlighting the potential of hyperbolic geometry for medical imaging.}
}



@InProceedings{pmlr-v315-palaniappan26a,
  title = 	 {Vesselpose: Vessel Graph Reconstruction from Learned Voxel-wise Direction Vectors in 3D Vascular Images},
  author =       {Palaniappan, Rajalakshmi and Karg, Christoph and Navarro-Arambula, Nemesio and Hirsch, Peter and Kainmueller, Dagmar and Mais, Lisa},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1252--1284},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/palaniappan26a/palaniappan26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/palaniappan26a.html},
  abstract = 	 {Blood vessel segmentation and tracing are essential tasks in many medical imaging applications. Although numerous methods exist, the prevailing segment-then-fix paradigm is fundamentally limited regarding its suitability for modelling the task of complete and topologically accurate vascular network reconstruction. We here propose an approach to extract topologically more accurate vascular graphs from 3D image data, building upon highly successful ideas from the related biomedical tasks of cell segmentation and tracking. Our approach first predicts voxel-wise vessel direction vectors jointly with standard vessel segmentation masks. Second, to extract the vascular graph from these predictions, we introduce a direction-vector-guided extension of the TEASAR algorithm. Our approach achieves state-of-the-art performance on three benchmark datasets, spanning both synthetic and real imagery. We further demonstrate the applicability of our approach to challenging 3D micro-CT scans of rat heart vasculature. Finally, we propose meaningful and interpretable measures of topological error, namely false splits and false merges for graphs. Overall, our approach substantially improves the topological accuracy of reconstructed vascular graphs, being able to separate closely apposed vessel segments and handle multiple vascular trees within a single volume.}
}



@InProceedings{pmlr-v315-yiasemis26a,
  title = 	 {End-to-End Co-Optimization of Adaptive $k$-space Sampling and Reconstruction for Dynamic MRI},
  author =       {Yiasemis, George and Sonke, Jan-Jakob and Teuwen, Jonas},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1285--1324},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/yiasemis26a/yiasemis26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/yiasemis26a.html},
  abstract = 	 {Accelerating dynamic MRI is essential for advancing clinical imaging and improving patient comfort. Most deep learning methods for dynamic MRI reconstruction rely on predetermined or random subsampling patterns that are uniformly applied across all temporal frames. Such strategies ignore temporal correlations and fail to optimize sampling for individual cases. To address this, we propose E2E-ADS-Recon, an end-to-end framework for adaptive dynamic MRI subsampling and reconstruction. The framework integrates an Adaptive Dynamic Sampler (ADS), which generates case-specific sampling patterns for a given acceleration factor, with a dynamic MRI reconstruction network that reconstructs the adaptively sampled data into a dynamic image sequence. The ADS can produce either frame-specific or unified patterns across time frames. We evaluate the method on multi-coil cardiac cine MRI data under both 1D and 2D sampling settings and compare it with standard and optimized non-adaptive baselines. E2E-ADS-Recon achieves superior reconstruction quality, particularly at higher acceleration rates. These results highlight the benefit of case-specific adaptive sampling and demonstrate the potential of joint sampling–reconstruction optimization for dynamic MRI. Code and trained models will be made publicly available upon acceptance.}
}



@InProceedings{pmlr-v315-marikkar26a,
  title = 	 {Domain Adaptation Without the Compute Burden for Efficient Whole Slide Image Analysis},
  author =       {Marikkar, Umar and Awais, Muhammad and Atito, Sara},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1325--1345},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/marikkar26a/marikkar26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/marikkar26a.html},
  abstract = 	 {Computational methods for analyzing Whole Slide Images (WSIs) enable early diagnosis and treatment by supporting pathologists in the detection and classification of tumors. However, the extremely high resolution of WSIs makes end-to-end training impractical compared to typical image analysis tasks. To address this, most approaches use pre-trained feature extractors to obtain fixed representations of whole slides, which are then combined with Multiple Instance Learning (MIL) for downstream tasks. These feature extractors are typically pre-trained on natural image datasets such as ImageNet, which fail to capture domain-specific characteristics. Although domain-specific pre-training on histopathology data yields more relevant feature representations, it remains computationally expensive and fails to capture task-specific characteristics within the domain. To address the computational cost and lack of task-specificity in domain-specific pre-training, we propose EfficientWSI (eWSI), a careful integration of Parameter-Efficient Fine-Tuning (PEFT) and Multiple Instance Learning (MIL) that enables end-to-end training on WSI tasks. We evaluate eWSI on seven WSI-level tasks over the Camelyon16, TCGA, and BRACS datasets. Our results show that eWSI, when applied with ImageNet feature extractors, yields strong classification performance, matching or outperforming MILs with in-domain feature extractors, alleviating the need for extensive in-domain pre-training. Furthermore, when eWSI is applied with in-domain feature extractors, it further improves classification performance in most cases, demonstrating its ability to capture task-specific information where beneficial. Our findings suggest that eWSI provides a task-targeted, computationally efficient path for WSI tasks, offering a promising direction for task-specific learning in computational pathology.}
}



@InProceedings{pmlr-v315-stolte26a,
  title = 	 {BETA: Resting-state fMRI Biotypes for tDCS Efficacy in Anxiety Among Older Adults At Risk For Alzheimer’s Disease},
  author =       {Stolte, Skylar E. and Cheng, Junfu and Acharya, Chintan and Gu, Lin and O'Shea, Andrew and Indahlastari, Aprinda and Woods, Adam J. and Fang, Ruogu},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1346--1374},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/stolte26a/stolte26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/stolte26a.html},
  abstract = 	 {Anxiety is usually gauged by self-report, yet a single symptom level can reflect disparate neural circuitry. In Alzheimer’s disease and related dementias (ADRD) this heterogeneity becomes a barrier to effective neuromodulation: some patients may benefit from transcranial direct-current stimulation (tDCS), while others may not. To overcome this obstacle, we introduced BETA (Biotypes for tDCS Efficacy in Anxiety), a data-driven pipeline that uses resting-state fMRI functional connectivity to derive anxiety subtypes that are intrinsically linked to tDCS response. A transformer-based variational autoencoder compresses high-dimensional connectivity into a 50-dimensional latent embedding that emphasizes networks implicated in cognitive aging and anxiety. A deep-embedded clustering loss, regularized by a clinically informed term that pulls together individuals who exhibit similar post-tDCS anxiety change, yields four distinct subtypes. Across all subtypes, disrupted coupling between sensory-processing and higher-order cognitive regions emerges as a common hallmark. Crucially, one cluster is resistant to frontal-lobe tDCS, whereas two clusters demonstrate significant anxiety reduction following stimulation. The responsive subtypes are defined by strengthened connectivity between the lateral occipital cortex—superior division (sLOC) and medial frontal cortex (MedFC), and between sLOC and the intracalcarine cortex (ICC). BETA demonstrates that fMRI-based subtyping can directly identify which patients are likely to benefit from tDCS, providing a concrete roadmap for precision psychiatry in ADRD and facilitating tailored therapeutic strategies for anxiety.}
}



@InProceedings{pmlr-v315-aas-alas26a,
  title = 	 {Does Grounding Improve Radiology Report Generation? An Empirical Study on PadChest-GR},
  author =       {Aas-Alas, Mohamed and Albiol, Alberto and Paredes, Roberto},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1375--1391},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/aas-alas26a/aas-alas26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/aas-alas26a.html},
  abstract = 	 {Radiology Report Generation (RRG) aims to automatically produce clinically accurate descriptions of medical images, yet current models often struggle with incomplete findings, generic phrasing, and hallucinations due to the absence of explicit grounding signals. To address these limitations, we propose a grounding-based RRG framework that integrates spatially localized visual evidence into the generation process. Our approach combines a ViT vision encoder with a GPT-2 language decoder through a lightweight transformer-based bridging module inspired by Bridge-Enhanced Vision Encoder–Decoder (VED) architectures. Grounding is introduced using bounding boxes of anatomical regions and pathologies, enabling the model to attend to both global and localized features. We further define and adopt the region-to-text task, where the model generates findings directly from specific regions of interest. Experiments on the PadChest-GR dataset demonstrate that grounding substantially improves linguistic quality and clinical accuracy, with the full image plus grounding mask configuration achieving the strongest gains across BLEU, ROUGE-L, CIDEr, BERTScore, CheXbert F1, and RadGraph F1. Analyses also show that even partial or noisy grounding yields consistent benefits.}
}



@InProceedings{pmlr-v315-ben-atya26a,
  title = 	 {Bootstrapped Physically-Primed Neural Networks for Robust T2 Distribution Estimation in Low-SNR Pancreatic MRI},
  author =       {Ben Atya, Hadas and Abramenkov, Nicole and Mashiah, Noa and Brock, Luise and Link Sourani, Daphna and Weiss, Ram and Freiman, Moti},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1392--1406},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/ben-atya26a/ben-atya26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/ben-atya26a.html},
  abstract = 	 {Estimating the multi-component $T_{2}$ relaxation distribution from Multi-Echo Spin Echo (MESE) MRI is a severely ill-posed inverse problem, traditionally approached with regularized non-negative least squares (NNLS). In abdominal imaging, and in the pancreas in particular, low Signal-to-Noise Ratio (SNR) and residual uncorrelated noise between reconstructed echoes challenge both classical solvers and deterministic deep learning models. We introduce a bootstrap-based inference framework for robust distributional $T_2$ estimation, which performs stochastic resampling of the echo train and aggregates predictions across multiple echo subsets. This strategy treats the acquisition as a distribution rather than a fixed input, yielding variance-reduced, physically consistent estimates and converting deterministic relaxometry networks into probabilistic ensemble predictors. Building on the P2T2 architecture, our method applies inference-time bootstrapping to smooth residual noise artifacts, increase tolerance to stochastic inference errors, and enhance fidelity to the underlying relaxation distribution. We demonstrate a clinical application of the proposed approach for functional and physical assessment of the pancreas. Currently available techniques for noninvasive pancreatic evaluation are limited due to the organ’s concealed retroperitoneal location and the procedural risks associated with biopsy, driven in part by the high concentration of proteases that can leak and cause intra-abdominal infection. These constraints highlight the need for functional imaging biomarkers capable of capturing early pathophysiological changes. A prominent example is type 1 diabetes (T1DM), in which progressive destruction of beta cells begins years before overt hyperglycemia, yet no existing imaging modality can assess early inflammation or the decline of pancreatic islets. A further unmet need lies in characterizing pancreatic lesions suspected of malignancy: although malignant and benign lesions differ in their physical properties, current imaging methods do not reliably distinguish between them. To examine the clinical utility of our method, we evaluate performance in a test–retest reproducibility study ($N=7$) and a T1DM versus healthy differentiation task ($N=8$). The proposed approach achieves the lowest Wasserstein distances across repeated scans and demonstrates superior sensitivity to subtle, physiology-driven shifts in the relaxation-time distribution, outperforming classical NNLS and non-bootstrapped deep learning baselines. These results establish inference-time bootstrapping as an effective and practical enhancement for quantitative $T_2$ relaxometry in low-SNR abdominal imaging, enabling more stable and discriminative estimation of relaxation-time distributions.}
}



@InProceedings{pmlr-v315-ren26a,
  title = 	 {Can You Trust Your Model? Constructing Uncertainty Approximations Guaranteeing Validity of Glioma Segmentation Explanations},
  author =       {Ren, Tianyi and Low, Daniel and Xiang, Rachel and Jaengprajak, Pittra and Rivera, Juampablo Heras and Olson, Riley and Ruzevick, Jacob and Kurt, Mehmet},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1407--1421},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/ren26a/ren26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/ren26a.html},
  abstract = 	 {Deep learning models have been successfully applied to glioma segmentation from multi-contrast MRI, yet model reasoning is difficult to validate clinically. Prior work used contrast-level Shapley values to explain how individual MRI sequences contribute to segmentation performance, and showed that alignment between these explanations and protocol-derived contrast rankings is associated with improved model performance. However, a single trained model may not reflect the optimal population-level model, and naive Deep Ensemble uncertainty estimates provide no guarantees that the true optimal explanation lies within their intervals. In this work, we construct statistically valid uncertainty intervals for contrast-level Shapley values in glioma segmentation. Using a U-Net trained on the BraTS 2024 GoAT dataset, we compute Shapley values for each MRI contrast and tumor sub-region, form naive uncertainty estimations from cross-validation, and then apply a frequentist framework based on uniform convergence to define a confidence set of plausibly optimal models. By optimizing mixed objectives that trade off empirical loss and Shapley value, we approximate the Pareto frontier and obtain lower and upper bounds on the optimal explanation. We compare these intervals with clinically derived consensus and protocol rankings. Our results demonstrate that naive uncertainty estimations can lead to inconclusive or misleading conclusions about clinical alignment, whereas frequentist intervals provide principled guarantees on coverage of the optimal explanation and show moderate correlation with annotator consensus, enabling more reliable validation of model explanations against established clinical reasoning.}
}



@InProceedings{pmlr-v315-stolt-anso26a,
  title = 	 {NISF$++$: Geometrically-grounded implicit representations of 3D$+$time cardiac function from 2D short- and long-axis MR views},
  author =       {Stolt-Ans\'o, Nil and Dannecker, Maik and Jia, Steven and McGinnis, Julian and Rueckert, Daniel},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1422--1444},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/stolt-anso26a/stolt-anso26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/stolt-anso26a.html},
  abstract = 	 {Clinical acquisition in cardiac magnetic resonance (CMR) imaging involves obtaining cross-sectional planes of the heart along the radial and longitudinal directions. Despite these planes being 2D cross-sectional images of the heart, radiologists understand the 3D spatial and continuous temporal nature of the organ being imaged. The same cannot be said about the conventional deep learning architectures used to process CMR images, which rely on in-plane and grid-based operations, and are hence unable to organically integrate information from all imaging planes. In this paper, we build upon previous work on neural implicit segmentation functions (NISF) to overcome unaddressed challenges in cardiac function modeling in the CMR domain. For a given subject, our architecture builds a shared 3D+time representation from all available acquisition planes regardless of orientation. By design, predictions along any imaging plane orientation are cross-sections of the same 3D representation, leading to spatio-temporal consistency across all slices. Moreover, our architecture makes the rotation and translation parameters of imaging planes learnable, allowing us to correct for the commonplace respiratory and patient motion between slice acquisitions under a rigid assumption. Furthermore, interpolation of intensities and segmentation can be performed in 4D at any desired resolution. We perform our study on a 120-subject sub-cohort of CMR imaging data from the UK-Biobank. We show our in-plane segmentation performance to be on par with existing CMR segmentation methods and explore how the majority of failure cases arise from limitations in the ground-truth segmentation, for which our representations make predictions with better anatomical accuracy than their original training data. We also evaluate our motion-correction capabilities, displaying quantitative and qualitative improvements in slice alignment. Our qualitative results explore how our representations can be derived irrespective of missing acquisition planes and open up avenues towards modeling complex sub-structures such as papillary muscles.}
}



@InProceedings{pmlr-v315-heras-rivera26a,
  title = 	 {BTReport: A Framework for Brain Tumor Radiology Report Generation with Clinically Relevant Features},
  author =       {Heras Rivera, Juampablo E. and Chen, Dickson T. and Ren, Tianyi and Low, Daniel K. and Ruzevick, Jacob and Ben Abacha, Asma and Santamaria-Pang, Alberto and Kurt, Mehmet},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1445--1472},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/heras-rivera26a/heras-rivera26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/heras-rivera26a.html},
  abstract = 	 {Recent advances in radiology report generation (RRG) have been driven by large paired image-text datasets; however, progress in neuro-oncology RRG has been limited due to a scarcity of open paired image-report datasets. Here, we introduce BTReport, an open-source framework for brain tumor RRG that constructs natural language radiology reports using reliably extracted quantitative imaging features. Unlike existing approaches that rely on general-purpose or fine-tuned vision-language models for both image interpretation and report composition, BTReport performs deterministic extraction of clinically relevant features, then uses large language models only for syntactic structuring and narrative synthesis. By separating RRG into deterministic feature extraction and report generation stages, synthetically generated reports are completely interpretable and contain reliable numerical measurements, a key component lacking in existing RRG frameworks. We validate the clinical relevance of BTReport-derived features, and demonstrate that BTReport-generated reports more closely resemble reference clinical reports when compared to existing baseline RRG methods. To further research in neuro-oncology RRG, we introduce BTReport-BraTS, a companion dataset that augments BraTS imaging with synthetic radiology reports generated with BTReport, and BTReview, a web-based platform for validating the clinical quality of synthetically generated radiology reports.}
}



@InProceedings{pmlr-v315-dannecker26a,
  title = 	 {Fast and Explicit: Slice-to-Volume Reconstruction via 3D Gaussian Primitives with Analytic Point Spread Function Modeling},
  author =       {Dannecker, Maik and Jia, Steven and Stolt-Ans{\'o}, Nil and Girard, Nadine and Auzias, Guillaume and Rousseau, Fran{\c{c}}ois and Rueckert, Daniel},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1473--1491},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/dannecker26a/dannecker26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/dannecker26a.html},
  abstract = 	 {Recovering high-fidelity 3D images from sparse or degraded 2D images is a fundamental challenge in medical imaging, with broad applications ranging from 3D ultrasound reconstruction to MRI super-resolution. In the context of fetal MRI, high-resolution 3D reconstruction of the brain from motion-corrupted low-resolution 2D acquisitions is a prerequisite for accurate neurodevelopmental diagnosis. While implicit neural representations (INRs) have recently established state-of-the-art performance in self-supervised slice-to-volume reconstruction (SVR), they suffer from a critical computational bottleneck: accurately modeling the image acquisition physics requires expensive stochastic Monte Carlo sampling to approximate the point spread function (PSF). In this work, we propose a shift from neural network based implicit representations to Gaussian based explicit representations. By parameterizing the HR 3D image volume as a field of anisotropic Gaussian primitives, we leverage the property of Gaussians being closed under convolution and thus derive a closed-form analytical solution for the forward model. This formulation reduces the previously intractable acquisition integral to an exact covariance addition ($\mathbf{\Sigma}_{obs} = \mathbf{\Sigma}_{HR} + \mathbf{\Sigma}_{PSF}$), effectively bypassing the need for compute-intensive stochastic sampling while ensuring exact gradient propagation. We demonstrate that our approach matches the reconstruction quality of self-supervised state-of-the-art SVR frameworks while delivering a 5$\times$–10$\times$ speed-up on neonatal and fetal data. With convergence often reached in under 30 seconds, our framework paves the way towards translation into clinical routine of real-time fetal 3D MRI.}
}



@InProceedings{pmlr-v315-varma26a,
  title = 	 {SegMaST: Mamba-based Spatio-Temporal Modeling to Improve Longitudinal Disease Detection and Segmentation},
  author =       {Varma, Aswathi and Weidner, Jonas and Lux, Laurin and Bercea, Cosmin and M{\"{u}}hlau, Mark and Kirschke, Jan and Wiestler, Benedikt and Rueckert, Daniel},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1492--1508},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/varma26a/varma26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/varma26a.html},
  abstract = 	 {Longitudinal medical image segmentation is fundamental for quantifying disease progression and evaluating treatment efficacy. However, two critical challenges persist: First, methods that jointly segment baseline and follow-up images remain underexplored, often missing the contextual benefits of simultaneous assessment and lacking longitudinal consistency. Second, real-world datasets typically exhibit severe class imbalance, as scans showing actual disease progression are far rarer than those showing stable anatomy, an issue frequently neglected by existing models. To address these limitations, we propose SegMaST, a novel Mamba-based spatio-temporal framework. Unlike conventional approaches that treat timepoints in isolation, SegMaST leverages cross-temporal information and spatial correspondences to jointly segment the initial baseline mask and explicitly localize new or progressive pathologies in follow-up scans. Additionally, we introduce an imbalance-aware loss accumulation strategy to enhance robustness in realistic clinical settings. On longitudinal cohorts of patients with Multiple Sclerosis (MS) and glioma, SegMaST outperforms established CNN- and attention-based baselines for follow-up segmentation (mean follow-up Dice MS in-house 0.536, MSSEG-2 0.620, and glioma 0.631) and lesion detection (F1 in-house 0.688, MSSEG-2 0.723), while maintaining state-of-the-art accuracy in baseline segmentation (Dice: 0.617 MS, 0.844 glioma).}
}



@InProceedings{pmlr-v315-tan26a,
  title = 	 {A-ADAPT: Adaptive Intracranial Artery Segmentation with Morphology-Guided Prompts and Difficulty-Aware Learning},
  author =       {Tan, Zhiwei and Wang, Xin and Wang, Meng and Liu, Zixuan and Guo, Yin and Xia, Jiamin and Balu, Niranjan and Shapiro, Linda and Yuan, Chun and Mossa-Basha, Mahmud},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1509--1522},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/tan26a/tan26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/tan26a.html},
  abstract = 	 {Accurate segmentation of intracranial arteries in CTA and MRA is essential for cerebrovascular analysis but remains challenging due to fine-scale artery morphology, modality-dependent appearance, and frequent structural discontinuities. Existing CNN- or Transformer-based models struggle to generalize across modalities, while SAM-based methods rely heavily on manually provided prompts and often fail to preserve thin or low-contrast arteries. We propose A-ADAPT, an adaptive intracranial artery segmentation framework that enhances SAM with modality-aware representation learning, automatic morphology-guided prompting, and difficulty-aware optimization. First, a Cross-Modality Task Adapter (CMTA) aligns CTA and MRA feature distributions while preserving shared vascular characteristics. The Frequency Adapter (FA) and the Tubular Morphology Adapter (TMA) work together to refine artery representation by enhancing structural detail and highlighting the continuity of tubular anatomy. To eliminate dependence on manual prompts, we introduce an Automatic Directional Morphology Prompt Encoder (AutoDM-Prompt), which generates artery-aware prompts directly from the input image. Additionally, a difficulty-aware loss dynamically upweights uncertain or discontinuity-prone regions, enabling the model to better recover small branches and reduce false positives. Experiments on CTA and MRA datasets show that A-ADAPT achieves higher accuracy and better structural continuity than several state-of-the-art methods.}
}



@InProceedings{pmlr-v315-sun26a,
  title = 	 {A Simple yet Effective Adaptive Inter-organ Contrastive Learning Framework for Unsupervised Domain Adaptation},
  author =       {Sun, Yiyou and Gao, Zheyao and Zhou, Xiaogen and Dou, Qi and Chiu Wing Chu, Winnie},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1523--1538},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/sun26a/sun26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/sun26a.html},
  abstract = 	 {Strong unsupervised domain adaptation (UDA) in multi-organ segmentation seeks to unify complementary information from heterogeneous imaging protocols within a single model without sacrificing source-modality performance, yet the substantial domain gap between modalities makes feature-level alignment non-trivial. Pseudo-label learning (PLL) has emerged as the dominant paradigm, but it suffers from information loss due to hard thresholding and bias introduced by class imbalance and noisy predictions. Contrastive learning (CL) offers a complementary direction by structuring semantic contrast, yet existing voxel-level formulations incur prohibitive computational costs on volumetric data and fail to capture the global anatomical context critical for organ segmentation. In this work, we propose Adaptive Inter-organ Contrastive Learning (AICL), a unified UDA framework for 3D multi-organ cross-modality segmentation that exploits PLL and CL synergistically to facilitate better cross-modality feature alignment. AICL employs dynamic soft pseudo-labels as guidance in the feature latent space to organize inter-organ samples as positive-negative pairs for CL. Meanwhile, the model is trained with supervised consistency learning (SCL) using mixed ground truths and pseudo-labels, promoting a more discriminative and compact shared latent space. Extensive experiments and ablation studies on an orbital and a cardiac dataset reveal the effectiveness of each component and a significant advancement in segmentation results.}
}



@InProceedings{pmlr-v315-moens26a,
  title = 	 {One-by-One Stainer: A Fast and Hallucination Resilient Domain Adaptation Method for Histopathology},
  author =       {Moens, Karel and De Vylder, Jonas and Tuytelaars, Tinne and Blaschko, Matthew B.},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1539--1564},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/moens26a/moens26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/moens26a.html},
  abstract = 	 {Histological staining is a crucial step in the analysis of tissue samples, enabling pathologists to identify and diagnose diseases. However, variations in staining protocols and equipment can lead to inconsistencies in image quality. As a result, histological stain normalization remains an active area of research with increasing demand for AI-driven diagnosis support. Unlike in other domain adaptation problems, in pathology the consequences of image hallucinations, which obscure or insert information, are much greater. We propose a method that mitigates the risk of hallucinations by simplifying the network architecture and training process. Our fully $1\times1$ convolutional architecture prevents textural modifications and we show that a residual connection combined with weight regularization effectively suppresses color information loss. The model does not need supervision from CycleGAN-based models. It is trained directly and leverages target domain color distribution information for better convergence without requiring any paired images. As a result, our method can be trained concurrently on images at varying scales, showing differing anatomical structures or dyes. This simplifies dataset collection, facilitates adoption at new centers, and reduces the number of models needed at inference.}
}



@InProceedings{pmlr-v315-liu26b,
  title = 	 {NeuroLangSeg: Language-Guided Subcortical Segmentation with Pseudo-Supervision and Anatomical–Linguistic Validation},
  author =       {Liu, Ruiying and Liu, Jialu and Zhang, Xuzhe and Huang, Chuang and Wang, Yun},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1565--1597},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/liu26b/liu26b.pdf},
  url = 	 {https://proceedings.mlr.press/v315/liu26b.html},
  abstract = 	 {Recent advances in vision–language models and LLMs have introduced contextual anatomical reasoning into brain MRI segmentation. However, the field still suffers from a fundamental limitation: the absence of a unified anatomical definition of the structures being segmented. Existing datasets rely on labels produced by heterogeneous manual workflows, often lacking explicit anatomical criteria or consistent annotation standards. As a result, models learn and evaluate within isolated labeling systems, limiting cross-model comparison and valid anatomical measurements. To address these challenges, we introduce NeuroLangSeg, a language-guided framework that enforces a consistent anatomical protocol for subcortical segmentation. A key component of the framework is an anatomical–linguistic evaluator that acts as a training discriminator, encouraging the model to produce outputs by assessing shape characteristics, protocol-defined spatial relationships, and age- and sex-adjusted volumetric norms. Building upon this constraint, NeuroLangSeg integrates a pretrained image encoder with protocol-aligned anatomical prompts and a masked pseudo-labeling strategy, enabling data-efficient and interpretable learning under limited supervision. Together, these components yield anatomically consistent segmentations and support subject-level reporting grounded in a unified anatomical standard. Evaluation across diverse MRI datasets—including comparisons with state-of-the-art models—shows that NeuroLangSeg achieves +4.1 DSC / +8.0 NSD in in-site settings and +3.6 DSC / +14.5 NSD in cross-site generalization over the average baseline, enabled by its LLM–visual integration, while delivering anatomically verifiable predictions suitable for both research and clinical use.}
}



@InProceedings{pmlr-v315-jiang26a,
  title = 	 {LightRefine-PCXR: A Lightweight Refinement Framework for Efficient Medical Device Suppression in Pediatric Chest X-Rays},
  author =       {Jiang, Mingze and Li, Xueyang and Kheir, John and Girten, Alec and Shi, Yiyu},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1598--1617},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/jiang26a/jiang26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/jiang26a.html},
  abstract = 	 {In pediatric chest radiography, indwelling support devices (e.g., tubes and lines) are ubiquitous and often obscure critical thoracic structures, complicating radiologic interpretation and reducing the reliability of automated analysis methods. Although generative inpainting has advanced rapidly, reliable deployment in pediatric chest radiographs remains challenging. Subtle anatomical cues must be preserved under substantial domain shift, while full adaptation of large backbones is often impractical because of limited pediatric data and constrained clinical compute budgets. To address these limitations, we propose LightRefine-PCXR, a lightweight, backbone-agnostic refinement framework for suppressing medical devices in pediatric chest X-rays (PCXRs). LightRefine-PCXR follows a two-stage strategy: a frozen pretrained inpainting backbone first produces a coarse device-removed estimate, and a compact anatomy-aware refiner then predicts mask-constrained residual corrections to restore local structures while preserving all unmasked pixels exactly. This plug-in design substantially reduces trainable parameters and peak GPU memory compared with end-to-end fine-tuning, yet consistently improves reconstruction fidelity and perceptual quality across diverse inpainting paradigms, including CNN-, transformer-, and diffusion-based models. Comprehensive in-domain and cross-dataset experiments demonstrate robust device suppression and strong generalization in low-data pediatric settings, highlighting the practicality of LightRefine-PCXR for real-world pediatric radiology workflows.}
}



@InProceedings{pmlr-v315-gao26a,
  title = 	 {Temporal Memory Enhancement for Semantic Segmentation in Surgical Video},
  author =       {Gao, Zheyao and Wu, Qian and Chen, Yueyao and Chen, Cheng and Yip, Hon Chi and Chu, Winnie Chiu Wing and Dou, Qi},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1618--1636},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/gao26a/gao26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/gao26a.html},
  abstract = 	 {Segmenting critical anatomical structures in surgical videos can enhance precision and patient safety by alerting surgeons to potential complications. While current methods that store features from past frames have advanced the performance in video segmentation, their reliance on a fixed-range local memory often fails to capture complex temporal contexts of surgical scenes. Specifically, the memory could fill with redundant features or omit informative frames due to the non-uniform rate of operations by the surgeons. Besides, the image features in the same phase of the surgery share similar patterns, while local memory could not capture such long-term relationships. Therefore, we propose a memory enhancement method to enrich the local temporal context and incorporate global phase context for surgical video semantic segmentation. Concretely, we improve the local memory with a feature selection module based on Determinantal Point Process (DPP) to choose past features that are diverse and relevant to the current feature. Besides, we introduce a global memory to store the common patterns of frames within each phase based on the conditional variational autoencoder with a mixture of Gaussian priors (CVAE-MoG). Experiments on endoscopic submucosal dissection (ESD) and laparoscopic cholecystectomy (LC) video segmentation demonstrate that our method achieves superior performance over existing methods.}
}



@InProceedings{pmlr-v315-hardy26a,
  title = 	 {Rendering with a Gut Feeling: Depth-Guided Triangle Splatting for Physically Consistent Colonoscopic Reconstruction},
  author =       {Hardy, Romain and Beltran, Andrea Dunn and Brenner, Todd A. and Berzin, Tyler M. and Rajpurkar, Pranav},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1637--1655},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/hardy26a/hardy26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/hardy26a.html},
  abstract = 	 {Colonoscopy scene reconstruction under monocular imaging remains challenging due to affine depth ambiguity in geometric priors and strong viewpoint-dependent specularities from coaxial illumination. We present GutSee, a depth-guided triangle splatting framework that addresses these challenges through two key innovations. First, we introduce an affine-invariant depth supervision scheme that accounts for per-frame scale and shift ambiguities in pretrained monocular depth estimators, enabling them to provide stable geometric guidance even when their predictions are mutually inconsistent. Second, we incorporate a physically motivated illumination model with an explicit coaxial spotlight and learnable BRDF parameters, preventing specular highlights from being misinterpreted as geometry. Together with triangle primitives that naturally enforce surface continuity, these components yield reconstructions that are both geometrically faithful and photometrically realistic. On a phantom colonoscopy dataset, GutSee reduces mean depth RMSE by 16.1% over the next-best method under biased supervision while maintaining comparable rendering quality. These results demonstrate that coupling affine-invariant depth guidance with physically accurate lighting models improves resilience to supervision bias, enabling reliable reconstruction even when using imperfect depth priors.}
}



@InProceedings{pmlr-v315-manescu26a,
  title = 	 {MILCA: Malaria Parasite Detection from Sample-Level Weak Labels},
  author =       {Manescu, Petru and Fernandez-Reyes, Delmiro},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1656--1674},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/manescu26a/manescu26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/manescu26a.html},
  abstract = 	 {Malaria diagnosis requires the inspection of multiple image fields per sample. Training vision models for malaria parasite detection typically requires large numbers of expert-provided bounding boxes, which are costly to obtain and often impractical in real-world deployments. We introduce MILCA, a weakly supervised object detection framework that learns parasite localization from sample-level diagnostic labels, which are routinely recorded in clinical practice. MILCA combines Multiple Instance Learning (MIL) for sample classification with an iterative Class Activation (CA) Mapping procedure that yields coarse parasite pseudo-labels, which are further enriched with hard negatives from parasite-free samples. These pseudo-labels enable training a detector without any manual bounding-box supervision. Experiments on multiple microscopy datasets show that MILCA achieves reliable detection and counting performance under fully weak supervision, and that fine-tuning with only a small fraction of expert annotations provides substantial additional gains, outperforming supervised and pseudo-labeling baselines under the same or lower annotation budgets. By converting coarse, sample-level clinical labels into effective object-level supervision, MILCA provides a label-efficient route toward automated malaria parasite detection and a general approach for weakly supervised blood film analysis.}
}



@InProceedings{pmlr-v315-li26e,
  title = 	 {Endo-SemiS: Towards Robust Semi-Supervised Image Segmentation for Endoscopic Video},
  author =       {Li, Hao and Lu, Daiwei and Yao, Xing and Kavoussi, Nicholas and Oguz, Ipek},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1675--1696},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/li26e/li26e.pdf},
  url = 	 {https://proceedings.mlr.press/v315/li26e.html},
  abstract = 	 {In this paper, we present Endo-SemiS, a semi-supervised segmentation framework that provides reliable segmentation of endoscopic video frames with limited annotation. Endo-SemiS uses four strategies to improve performance by effectively utilizing all available data, particularly unlabeled data: (1) Cross-supervision between two individual networks that supervise each other; (2) Uncertainty-guided pseudo-labels from unlabeled data, which are generated by selecting high-confidence regions to improve their quality; (3) Joint pseudo-label supervision, which aggregates reliable pixels from the pseudo-labels of both networks to provide accurate supervision for unlabeled data; and (4) Mutual learning, where both networks learn from each other at the feature and image levels, reducing variance and guiding them toward a consistent solution. Additionally, a separate corrective network utilizes spatiotemporal information from endoscopy video to improve segmentation performance. Endo-SemiS is evaluated on two clinical applications: kidney stone laser lithotomy from ureteroscopy and polyp screening from colonoscopy. Compared to state-of-the-art segmentation methods, Endo-SemiS achieves substantially superior results on both datasets with limited labeled data.}
}



@InProceedings{pmlr-v315-li26f,
  title = 	 {EndoStreamDepth: Temporally Consistent Monocular Depth Estimation for Endoscopic Video Streams},
  author =       {Li, Hao and Lu, Daiwei and Wang, Jiacheng and Webster, Robert J. and Oguz, Ipek},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1697--1721},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/li26f/li26f.pdf},
  url = 	 {https://proceedings.mlr.press/v315/li26f.html},
  abstract = 	 {This work presents EndoStreamDepth, a monocular depth estimation framework for endoscopic video streams. It provides accurate depth maps with sharp anatomical boundaries for each frame, temporally consistent predictions across frames, and real-time throughput. Unlike prior work that uses batched inputs, EndoStreamDepth processes individual frames with a temporal module to propagate inter-frame information. The framework contains three main components: (1) a single-frame depth network with endoscopy-specific transformation to produce accurate depth maps, (2) multi-level Mamba temporal modules that leverage inter-frame information to improve accuracy and stabilize predictions, and (3) a hierarchical design with comprehensive multi-scale supervision, where complementary loss terms jointly improve local boundary sharpness and global geometric consistency. We conduct comprehensive evaluations on two publicly available colonoscopy depth estimation datasets, with quantitative results reported on phantom and simulated data that provide ground truth depth. Compared to state-of-the-art monocular depth estimation methods, EndoStreamDepth substantially improves performance, and it produces depth maps with sharp, anatomically aligned boundaries, which are essential to support downstream tasks such as automation for robotic surgery.}
}



@InProceedings{pmlr-v315-suh26a,
  title = 	 {Landmark Detection Uncertainty as a Reliability Weight for Robust Landmark-based 2D/3D Pelvic Pose Estimation},
  author =       {Suh, Yehyun and Schott, Brayden and Mo, Chou and Martin, J. Ryan and Moyer, Daniel},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1722--1739},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/suh26a/suh26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/suh26a.html},
  abstract = 	 {Landmark-based 2D/3D pelvis registration is vulnerable to noisy or ambiguous landmark detections in fluoroscopy, which can destabilize downstream pose estimation. We present an uncertainty-aware registration framework that models epistemic uncertainty in predicted landmarks and incorporates it directly into the Perspective-n-Point formulation. Using Monte Carlo dropout within a U-Net detector, we compute sample-specific per-landmark reliability estimates using the variance of multiple stochastic forward passes. These reliability estimates guide two complementary strategies: continuous weighting, which integrates uncertainty into a weighted PnP optimization, and discrete selection, which removes the most uncertain landmarks during inference. We evaluate the framework on both CT-derived synthetic fluoroscopy and real fluoroscopy from DeepFluoro. Our experiments show that uncertainty provides a principled mechanism for identifying unreliable landmarks and stabilizing pose estimation, enabling more robust registration and establishing a foundation for uncertainty-guided image-guided surgical workflows.}
}



@InProceedings{pmlr-v315-nizam26a,
  title = 	 {X-Cardia: Phenotype-Guided Cross-Modal Alignment for Opportunistic Cardiac Screening on Routine Chest CT},
  author =       {Nizam, Nusrat Binta and Liu, Fengbei and Kwak, Sunwoo and Richter, Ilan and Raikhelkar, Jayant K and Beecy, Ashley and Uriel, Nir and Estrin, Deborah and Sabuncu, Mert R},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1740--1767},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/nizam26a/nizam26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/nizam26a.html},
  abstract = 	 {Deep learning models for cardiac prognostics often operate within single-modality frameworks, limiting their ability to capture physiologically meaningful cross-modal relationships. In particular, we focus on non-gated, non-contrast chest computed tomography (CT) scans that are typically acquired for entirely non-cardiac indications, rather than for dedicated cardiac assessment. We introduce X-Cardia, a phenotype-guided multimodal alignment framework that transfers structural cardiac phenotypes from echocardiography (ECHO) and electrocardiography (ECG) into CT representations by enforcing explicit phenotype-level consistency. This setting is intrinsically challenging because the lack of cardiac gating obfuscates the cardiac phase and the absence of contrast limits the visibility of cardiovascular structures, but these scans represent a rich resource for opportunistic cardiac screening. The approach combines CLIP-style contrastive pre-training to align image and tabular embeddings with a non-parametric Nadaraya–Watson phenotype head, which uses a support-bank to guide the latent space toward clinically meaningful axes. This enables the image encoder to learn physiological features that generalize beyond modality boundaries. We pre-train using data from 20,574 patients and fine-tune the resulting image encoder on ten cardiac abnormality prediction tasks. The proposed method consistently outperforms both standard contrastive learning and the baseline without pre-training, achieving a gain of up to 8% in AUROC on the test set. In the 5-shot setting, phenotype-guided alignment improves AUROC by an average of 9.8% over baselines, demonstrating strong data efficiency and generalization from few labeled samples. Our results show that explicit phenotype-guided alignment yields interpretable, data-efficient representations that transfer cardiac knowledge to non-cardiac CTs, defining a promising paradigm for multimodal medical imaging.}
}



@InProceedings{pmlr-v315-liu26c,
  title = 	 {HyperCT: Low-Rank Hypernet for Unified Chest CT Analysis},
  author =       {Liu, Fengbei and Kwak, Sunwoo and Phung, Hao and Nizam, Nusrat Binta and Richter, Ilan and Uriel, Nir and Averbuch-Elor, Hadar and Estrin, Deborah and Sabuncu, Mert R.},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1768--1801},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/liu26c/liu26c.pdf},
  url = 	 {https://proceedings.mlr.press/v315/liu26c.html},
  abstract = 	 {Non-contrast chest CTs offer a rich opportunity for both conventional pulmonary and opportunistic extra-pulmonary screening. While Multi-Task Learning (MTL) can unify these diverse tasks, standard hard-parameter sharing approaches are often suboptimal for modeling distinct pathologies. We propose HyperCT, a framework that dynamically adapts a Vision Transformer backbone via a Hypernetwork. To ensure computational efficiency, we integrate Low-Rank Adaptation (LoRA), allowing the model to regress task-specific low-rank weight updates rather than full parameters. Validated on a large-scale dataset of radiological and cardiological tasks, HyperCT outperforms various strong baselines, offering a unified, parameter-efficient solution for holistic patient assessment.}
}



@InProceedings{pmlr-v315-khawaled26a,
  title = 	 {FutureMorph: Toward Predicting Future Deformation Fields in Longitudinal Imaging},
  author =       {Khawaled, Samah and Sabuncu, Mert R.},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1802--1820},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/khawaled26a/khawaled26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/khawaled26a.html},
  abstract = 	 {Understanding how anatomy evolves over time is essential for tracking disease progression, quantifying risk, and studying healthy development and aging. Existing approaches either synthesize future images without modeling geometry or perform longitudinal registration that requires follow-up scans. We introduce FutureMorph, a framework that treats longitudinal forecasting as metadata-conditioned prediction of future diffeomorphic deformation fields. Given a baseline image (e.g., a brain MRI) and subject-level metadata (age, sex, and clinical variables), FutureMorph predicts time-indexed, subject-specific diffeomorphic deformation fields that explicitly capture future anatomical change. We employ a metadata-conditioned U-Net to estimate stationary velocity vector fields, which are integrated into smooth diffeomorphisms and applied using a spatial transformer to synthesize future images. Experiments on the OASIS-3 dataset show that our framework produces clinically meaningful predicted deformations and realistic future scans, capturing age- and interval-dependent trajectories. Our work provides a new perspective for longitudinal imaging studies by unifying image synthesis and deformation modeling.}
}



@InProceedings{pmlr-v315-ben-haddou26a,
  title = 	 {CyclePhase: Robust phase detection in cardiovascular imaging through cyclic motion estimation},
  author =       {Ben Haddou, Soufiane and van Herten, Rudolf L. M. and Bezzina, Connie R. and Planken, R. Nils and Daemen, Joost and Wentzel, Jolanda J. and Henriques, Jos{\'e} P. and I{\v{s}}gum, Ivana},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1821--1839},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/ben-haddou26a/ben-haddou26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/ben-haddou26a.html},
  abstract = 	 {Accurate cardiac phase detection is essential for cardiovascular imaging applications requiring temporally aligned measurements. While existing methods treat phase detection as discrete frame classification, we propose a fundamentally different approach that models cardiac phase as a continuous cyclic variable on the unit circle. Our method introduces gradient-based input transformations to isolate motion from static anatomy, thereby making it robust to appearance variations, such as calcifications, in intravascular ultrasound (IVUS). Through multi-objective optimization combining temporal consistency via Earth mover’s distance with continuous phase regression, we achieve superior performance across both IVUS and cardiac MRI. Experiments demonstrate that explicitly modelling cardiac periodicity yields more accurate and temporally coherent phase detection compared to classification-based approaches, with particular improvements in artefact-heavy clinical scenarios. Our unified framework eliminates the need for modality-specific preprocessing or segmentation masks, providing an end-to-end solution for cardiac motion characterization.}
}



@InProceedings{pmlr-v315-ji26a,
  title = 	 {Unpaired Multimodal Learning for Biological Datasets},
  author =       {Ji, Zongliang and Eastwood, Cian and Goldenberg, Anna and Liang, Paul Pu and Hartford, Jason and Krishnan, Rahul G. and Noutahi, Emmanuel},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1840--1868},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/ji26a/ji26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/ji26a.html},
  abstract = 	 {Multimodal learning holds tremendous promise for biology, providing a path to integrate diverse data types and ultimately construct a more complete picture of underlying biological mechanisms. However, most existing approaches for multimodal learning require paired samples—an impractical assumption in biology, where measurement devices often destroy samples (e.g., RNA sequencing). To address this challenge, we introduce IntraPair InterCluster (IPIC), a novel contrastive approach for multimodal learning that departs from traditional reliance on paired data by requiring only treatment-group labels. IPIC aligns modalities through intra-treatment group matching and inter-treatment group clustering, producing embeddings that are both accurate and biologically meaningful. In experiments on four curated multimodal biological datasets, IPIC consistently outperforms baseline approaches, highlighting its effectiveness in leveraging independently collected single-modality datasets for multimodal contrastive pre-training.}
}



@InProceedings{pmlr-v315-leem26a,
  title = 	 {REVEAL: Multimodal Vision–Language Alignment of Retinal Morphometry and Clinical Risks for Incident AD and Dementia Prediction},
  author =       {Leem, Seowung and Gu, Lin and You, Chenyu and Gong, Kuang and Fang, Ruogu},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1869--1889},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/leem26a/leem26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/leem26a.html},
  abstract = 	 {The retina provides a unique, noninvasive window into Alzheimer’s disease and dementia, capturing early structural changes through morphometric features, while systemic and lifestyle risk factors reflect well-established contributors to AD and dementia susceptibility long before clinical symptom onset. However, current retinal analysis frameworks typically model imaging and risk factors separately, preventing them from capturing the joint multimodal patterns that are critical for early risk prediction. Moreover, existing methods rarely incorporate mechanisms to organize or align patients with similar retinal and clinical characteristics, limiting their ability to learn coherent cross-modal associations. To address these limitations, we introduce REVEAL (REtinal-risk Vision-language Early Alzheimer’s Learning), a framework that aligns color fundus photographs with individualized disease-specific risk profiles for incident AD and dementia prediction, on average 8 years before diagnosis (range: 1–11 years). Because real-world risk factors are structured questionnaire data, we first translate them into clinically interpretable narratives compatible with pretrained vision-language models (VLMs). We further propose a group-aware contrastive learning (GACL) strategy that clusters patients with similar retinal morphometry and risk factors as positive pairs, strengthening multimodal alignment. This unified representation-learning framework substantially outperforms state-of-the-art retinal imaging models paired with clinical text encoders, as well as general VLMs, demonstrating the value of jointly modeling retinal biomarkers and clinical risk factors. By providing a generalizable, noninvasive approach for early AD and dementia risk stratification, REVEAL has the potential to enable earlier interventions and improve preventive care at the population level.}
}



@InProceedings{pmlr-v315-shakya26a,
  title = 	 {MedKamba: A Novel Approach Integrating State-Space Models and Fractional Kolmogorov–Arnold Networks for Medical Image Segmentation},
  author =       {Shakya, Amit and Yadav, Akanksha and Kumar, Rupesh and Sharma, Lalit},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1890--1902},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/shakya26a/shakya26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/shakya26a.html},
  abstract = 	 {Medical image segmentation plays a crucial role in healthcare, serving as a key step in disease diagnosis and treatment planning. Convolutional neural networks (CNNs) are limited by their restricted receptive fields, whereas Transformer-based models suffer from quadratic computational cost. Recent advances such as Mamba, a selective state-space model with linear complexity, and its vision-oriented variant, the Visual State Space (VSS) model, have shown strong ability to capture long-range dependencies efficiently. However, they still exhibit shortcomings in segmentation tasks, including loss of pixel-level structural information and inefficient channel utilization. To address this, we introduce a VSSM-based Local Aware Channel Enhancement (LACE) block, which incorporates local enhancement and channel attention to better preserve spatial detail. Building on this, we propose MedKamba, a novel U-shaped segmentation approach that employs a hybrid encoder with CNNs and LACE blocks to effectively capture both local and global contextual information. While the U-Net backbone remains highly efficient, its traditional skip connections rely on simple scale-matched fusion, limiting cross-scale interaction. To overcome this, we redesign the skip connections using Fractional Kolmogorov–Arnold Networks (f-KANs) to generate channel-wise attention weights from features aggregated across multiple stages. Experiments on two benchmark datasets demonstrate that MedKamba consistently outperforms competing approaches and produces more visually accurate segmentation results.}
}



@InProceedings{pmlr-v315-kirchner26a,
  title = 	 {Federated EndoViT: Pretraining Vision Transformers via Federated Learning on Endoscopic Image Collections},
  author =       {Kirchner, Max and Jenke, Alexander C. and Bodenstedt, Sebastian and Kolbinger, Fiona R. and Saldanha, Oliver L. and Kather, Jakob N. and Wagner, Martin and Speidel, Stefanie},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1903--1934},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/kirchner26a/kirchner26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/kirchner26a.html},
  abstract = 	 {Purpose: Data privacy regulations hinder the creation of generalizable foundation models (FMs) for surgery by preventing multi-institutional data aggregation. This study investigates federated learning (FL) as a privacy-preserving solution to collaboratively train robust surgical FMs. Methods: We introduce Federated EndoViT (FL-EndoViT), a federated framework that validates the Masked Autoencoder (MAE) pretraining strategy in a decentralized surgical setting. To ensure convergence under severe data heterogeneity, the architecture integrates adaptive Sharpness-Aware Minimization (FedSAM). Pretrained on the large-scale Endo700k dataset, FL-EndoViT is evaluated against a centralized baseline on different tasks including scene segmentation, action recognition, and phase recognition. Results: FedSAM is critical for successful pretraining, overcoming the convergence failures of standard federated methods. The resulting FL-EndoViT performs comparably to its centralized counterpart, with significant advantages in data-scarce, high-resolution segmentation and generalization to new surgical events. We also establish that full, end-to-end fine-tuning is necessary for optimal performance. Conclusion: This work validates FL with adaptive optimization as a viable paradigm for creating robust, privacy-preserving surgical FMs. Our findings provide a scalable framework for collaborative Surgical Data Science and underscore the optimizer’s critical role in handling data heterogeneity. Future work should explore video-based models to incorporate spatiotemporal dynamics.}
}



@InProceedings{pmlr-v315-gundersen26a,
  title = 	 {RadVLM-GRPO: Enhancing Chest X-ray Report Generation and Visual Grounding via Reinforcement Learning},
  author =       {Gundersen, Benjamin and Deperrois, Nicolas and Ruiperez-Campillo, Samuel and Sutter, Thomas M. and Vogt, Julia E. and Moor, Michael and Nooralahzadeh, Farhad and Krauthammer, Michael},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1935--1968},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/gundersen26a/gundersen26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/gundersen26a.html},
  abstract = 	 {Recent advances in vision-language models (VLMs) have improved Chest X-ray (CXR) interpretation in multiple aspects. However, many medical VLMs rely solely on supervised fine-tuning (SFT), which optimizes next-token prediction without evaluating answer quality. In contrast, reinforcement learning (RL) can incorporate task-specific feedback, and its combination with explicit intermediate reasoning (“thinking”) has demonstrated substantial gains on verifiable math and coding tasks. To investigate the effects of RL and thinking in a CXR VLM, we perform large-scale SFT on CXR data to build an updated RadVLM based on Qwen3-VL, followed by a cold-start SFT stage that equips the model with basic thinking ability. We then apply Group Relative Policy Optimization (GRPO) with clinically grounded, task-specific rewards for report generation and visual grounding, and run matched RL experiments on both domain-specific and general-domain Qwen3-VL variants, with and without thinking. Across these settings, we find that while strong SFT remains crucial for high base performance, RL provides additional gains on both tasks, whereas explicit thinking does not appear to further improve results. Under a unified evaluation pipeline, the RL-optimized RadVLM models outperform their baseline counterparts and reach state-of-the-art performance on both report generation and grounding, highlighting clinically aligned RL as a powerful complement to SFT for medical VLMs. Code is available at and the updated SFT and RL models will be released under a new version at .}
}



@InProceedings{pmlr-v315-shor26a,
  title = 	 {T1-PILOT: Physics-Informed Learned Optimized Trajectories for T1 Mapping Acceleration},
  author =       {Shor, Tamir and Freiman, Moti and Baskin, Chaim and Bronstein, Alex},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1969--1982},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/shor26a/shor26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/shor26a.html},
  abstract = 	 {Cardiac T1 mapping provides critical quantitative insights into myocardial tissue composition, enabling the assessment of pathologies such as fibrosis, inflammation, and edema. However, the inherently dynamic nature of the heart imposes strict limits on acquisition times, making high-resolution T1 mapping a persistent challenge. Compressed sensing (CS) approaches have reduced scan durations by undersampling k-space and reconstructing images from partial data, and recent studies show that jointly optimizing the undersampling patterns with the reconstruction network can substantially improve performance. Still, most current T1 mapping pipelines rely on static, hand-crafted masks that do not exploit the full acceleration and accuracy potential. Furthermore, most existing methods do not leverage the physical T1 decay model in optimization. In this work, we introduce T1-PILOT: an end-to-end method that explicitly incorporates the T1 signal relaxation model into the sampling–reconstruction framework to guide the learning of non-Cartesian trajectories, cross-frame alignment, and T1 decay estimation. Through extensive experiments on the CMRxRecon dataset, T1-PILOT significantly outperforms several baseline strategies (including learned single-mask and fixed radial or golden-angle sampling schemes), achieving higher T1 map fidelity at greater acceleration factors. In particular, we observe consistent gains in PSNR and VIF relative to existing methods, along with marked improvements in delineating finer myocardial structures. Our results highlight that optimizing sampling trajectories in tandem with the physical relaxation model leads to both enhanced quantitative accuracy and reduced acquisition times.}
}



@InProceedings{pmlr-v315-weng26a,
  title = 	 {SEG4SEG: Identifying Systematic Failure Modes in Segmentation by Subgroup Discovery Methods},
  author =       {Weng, Nina and Petersen, Eike and Bissoto, Alceu and Sun, Susu and Koch, Lisa M. and Feragen, Aasa and Bigdeli, Siavash and Baumgartner, Christian F.},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1983--2002},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/weng26a/weng26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/weng26a.html},
  abstract = 	 {Deep learning models for medical image segmentation can achieve high overall performance but fail systematically on critical subgroups. While Slice Discovery Methods (SDM) have shown promise in revealing classification failures, their effectiveness for segmentation remains unexplored. Moreover, although various systematic failures have been reported in segmentation tasks, no prior work has systematically categorized them. In this work, we address both gaps. First, we categorize potential sources of systematic errors in medical image segmentation. Second, we empirically investigate whether SDMs can identify problematic slices in each of those categories without manual annotations. Our evaluation covers four controlled failure types and two real-world failure cases, using medical imaging datasets and explicit success criteria for SDM evaluation. Our experiments show that SDMs adapted for segmentation can identify systematic errors, demonstrating their potential for failure analysis in medical imaging.}
}



@InProceedings{pmlr-v315-alvarez-florez26a,
  title = 	 {Tagged-Informed Prior for Motion Quantification in Cine CMR Using Implicit Neural Representations},
  author =       {Alvarez-Florez, Laura and Ben Haddou, Soufiane and Tjong, Fleur V. Y. and I{\v{s}}gum, Ivana},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2003--2018},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/alvarez-florez26a/alvarez-florez26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/alvarez-florez26a.html},
  abstract = 	 {Accurate quantification of myocardial motion from cine cardiac magnetic resonance (CMR) is essential for assessing cardiac function. Although tagged CMR provides high-fidelity measurements of myocardial deformation, its longer acquisition time limits routine clinical use, making cine CMR motion estimation the more widely applicable approach. Implicit neural representations (INRs) offer a promising framework for cine-based motion estimation by modelling cardiac motion as a continuous spatio-temporal function. However, they require subject-specific optimisation and are sensitive to initialization, leading to slow convergence. Furthermore, optimisation from random initialization can lead to a large number of solutions that may not guarantee biomechanically plausible motion. To address these limitations, we propose a strategy to improve and accelerate INR-based registration of cine CMR by leveraging a population-level prior derived from tagged CMR data. First, we train subject-specific INRs on the tagged cine dataset to encode characteristic myocardial deformation patterns. Second, we aggregate their parameters across subjects to form a tagged-informed population prior. Third, we use this prior initialization to warm-start the optimization of cine INRs. The resulting prior provides a physiologically meaningful starting point for cine-only INR optimisation, reducing the search space and promoting more realistic cardiac motion. We develop and test the method on the UK Biobank. Compared with standard initialization, the proposed prior enables the INR to reach near-optimal performance using only half as many optimisation steps, achieving a 4% improvement in Dice and a 15% reduction in Hausdorff distance. These gains also translate to a test set of 855 subjects from a different institution, encompassing different pathologies, where the prior yields smoother and more physiologically plausible strain curves.}
}



@InProceedings{pmlr-v315-akoda26a,
  title = 	 {Beyond Diffusion: Consistency Models for One-Step, High-Fidelity MRI Reconstruction},
  author =       {Akoda, Mary-Brenda and Qin, Chen},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2019--2037},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/akoda26a/akoda26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/akoda26a.html},
  abstract = 	 {Magnetic resonance imaging (MRI) provides excellent soft-tissue contrast, but suffers from long acquisition times, limiting throughput and increasing patient discomfort. Diffusion-based generative models have recently achieved state-of-the-art reconstruction quality for accelerated MRI, but typically require hundreds to thousands of neural function evaluations (NFEs), which severely limits their practicality in time-sensitive clinical settings. We introduce C-MORE (Consistency-Model-based One-step REconstruction for MRI), to our knowledge, the first one-step consistency model framework for accelerated MRI reconstruction. C-MORE investigates an unconditional one-step prior and solves the inverse problem in one NFE by leveraging measurement-guided encoding and tunable physics-based refinement, thus eliminating multi-NFE diffusion sampling, while retaining a controllable quality-speed trade-off. On the MICCAI CMR$\times$Recon dataset spanning multiple cardiac contrasts and both single- and multi-coil acquisitions, C-MORE outperforms state-of-the-art diffusion-based samplers and strong non-diffusion unrolled methods across accelerations in just 1 NFE, while reconstructing images in $0.18-0.52$ s ($\approx$$22-193$$\times$ faster than diffusion-based methods requiring hundreds of NFEs). Remarkably, without any retraining or finetuning, C-MORE also demonstrates cross-anatomy generalisation to the unseen fastMRI knee dataset from NYU Langone Health and Facebook AI Research, again surpassing state-of-the-art methods across accelerations. These results establish C-MORE as a practical blueprint for real-time, high-fidelity MRI reconstruction across diverse contrasts, acquisition settings, anatomies, and accelerations.}
}



@InProceedings{pmlr-v315-hamdy26a,
  title = 	 {Decoupling Vision and Reasoning: A Data-Efficient Pipeline for Surgical VQA},
  author =       {Hamdy, Mohamed and Ahmed, Fatmaelzahraa Ali and Abdel-Ghani, Muraam and Arsalan, Muhammad and Suganthan, Ponnuthurai Nagaratnam and Al-Jalham, Khalid and Al-Ali, Abdulaziz and Balakrishnan, Shidin},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2038--2056},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/hamdy26a/hamdy26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/hamdy26a.html},
  abstract = 	 {Vision-language models (VLMs) are becoming increasingly important for surgical intelligence, where reliable scene understanding requires combining visual perception with language-based reasoning. However, progress is constrained by the scarcity of high-quality multimodal datasets, making end-to-end training more prone to overfitting. Existing approaches often address this limitation by converting task-specific datasets (e.g., segmentation, phase recognition, tool-tissue interaction) into synthetic visual question answering (VQA) form, but such conversions provide only sparse supervision and limit generalization. To overcome these challenges, we propose Surg-SAGE (Structured Abstraction from Granular Experts), a modular pipeline that decouples vision information extraction from reasoning. Specialist surgical models, proven effective for their corresponding vision tasks, are first used to extract task-relevant signals, which are then transformed via heuristics into structured textual descriptions. These descriptions, together with the clinical question, are passed to a large language model (LLM) that performs the reasoning step and provides the answer. The novelty of this work lies in demonstrating that decoupling perception from language processing and leveraging expert-trained specialist models enables strong VQA performance, even when paired with relatively lightweight, frozen LLMs and without requiring multimodal training data. We evaluate this pipeline on the EndoVis-18-VQA benchmark under different configurations of specialist models and LLMs, showing that combining complementary experts yields stronger performance than relying on any single model. Surg-SAGE achieves higher accuracy, recall and F1 than existing surgical VQA baselines, with improvements of up to 2.3% in accuracy without requiring multimodal training, establishing abstraction-driven modularity as a data-efficient and generalizable paradigm for surgical vision-language understanding.}
}



@InProceedings{pmlr-v315-huang26a,
  title = 	 {ECT-3DMedSAM: Efficient Cross Teaching Using Segment Anything Model for Semi-Supervised 3D Medical Image Segmentation},
  author =       {Huang, Zhewen and Guariglia, Sara R. and Yang, Jiaqi and Tsai, Chia-Ling},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2057--2070},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/huang26a/huang26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/huang26a.html},
  abstract = 	 {As precise manual annotation for medical imaging is both expert-intensive and costly, Semi-Supervised Medical Image Segmentation (SSMIS) provides a critical solution by leveraging large volumes of unlabeled data to achieve the high-performance segmentation necessary for anatomical structure analysis and disease diagnosis. Standard SSMIS models typically train specialized models with limited initialization, often failing to capture the complex semantic nuances of 3D anatomy. Foundation models offer superior generalization capabilities by leveraging large-scale pre-training but still struggle to adapt effectively when downstream annotations are limited. In this paper, we propose a novel cross-teaching framework tailored for the efficient adaptation of the 3D foundation model MedSAM-2. We introduce a parameter-efficient design that shares frozen image and prompt encoders between two parallel, Low-Rank Adaptation (LoRA)-learnable mask decoders. Furthermore, we replace the memory-intensive attention mechanism with a lightweight temporal propagation module to reduce memory consumption while maintaining critical local volumetric coherence. Our model processes the same input volume through weak and strong augmentations to create a synergistic learning loop where the two decoders mutually supervise each other. We validate our method across three distinct datasets and modalities. Experimental results demonstrate that our framework effectively bridges the domain gap, achieving a 57.9% reduction in the average 95% Hausdorff Distance, substantially enhancing boundary precision for fine anatomical structures. Furthermore, our approach outperforms state-of-the-art baselines with a Dice score improvement of up to 2.8%, confirming its robustness and clinical reliability for volumetric segmentation.}
}



@InProceedings{pmlr-v315-tastan26a,
  title = 	 {MultiPersistence Topological Fusion with Vision Transformers for Skin Cancer Detection},
  author =       {Tastan, Fulya and Chakraborty, Sayoni and Lee, Sangyeon and Coskunuzer, Baris},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2071--2096},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/tastan26a/tastan26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/tastan26a.html},
  abstract = 	 {Skin cancer is a common and potentially fatal disease where early detection is crucial, especially for melanoma. Current deep learning systems classify skin lesions well, but they primarily rely on appearance cues and may miss deeper structural patterns in lesions. We present TopoCon-MP, a method that extracts multiparameter topological signatures from dermoscopic images to capture multiscale lesion structure, and fuses these signatures with Vision Transformers using a supervised contrastive objective. Across three public datasets, TopoCon-MP improves in-distribution performance over strong pretrained CNN and ViT baselines, and in cross-dataset transfer, it maintains competitive performance. Ablations show that both multiparameter topology and contrastive fusion contribute to these gains. The resulting topological channels also provide an interpretable view of lesion organization that aligns with clinically meaningful structures. Overall, TopoCon-MP demonstrates that multipersistence-based topology can serve as a complementary modality for more robust skin cancer detection.}
}



@InProceedings{pmlr-v315-dutta26a,
  title = 	 {Quantum-Inspired Orthonormal CNN for Energy-Efficient Medical Image Denoising},
  author =       {Dutta, Sayantan},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2097--2117},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/dutta26a/dutta26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/dutta26a.html},
  abstract = 	 {Medical imaging modalities (MRI, CT, PET, US) are often degraded by acquisition noise, which obscures subtle anatomical details and compromises diagnostic reliability. Conventional denoising approaches, including spatial filters and deep learning (DL) models, often struggle to balance noise suppression with preservation of fine structures, and state-of-the-art architectures typically incur high computational and energy costs. This work introduces a novel quantum-inspired convolutional neural network (QICNN) that embeds principles of orthonormal basis representation and unitary channel mixing into a compact UNet-style architecture. By constraining convolutional kernels to orthonormal subspaces and enforcing norm-preserving transformations, QICNN eliminates feature redundancy, stabilizes optimization, and maintains energy consistency across layers. Evaluations on real noisy brain MRI datasets show that QICNN achieves superior texture fidelity and lesion conspicuity compared to standard DL models, as evidenced by improvements in GLCM-based metrics and contrast-to-noise ratio. In addition to quality gains, QICNN reduces parameter count by $\sim$93%, inference latency by $\sim$98%, and energy consumption by $\sim$97% relative to transformer-scale denoisers, significantly lowering computational overhead and carbon footprint. These findings highlight the potential of physics-guided design to deliver interpretable, efficient, and clinically robust solutions for medical image restoration.}
}



@InProceedings{pmlr-v315-byun26a,
  title = 	 {Test-Time Scaling in Clinical Decision Making},
  author =       {Byun, Ji Young and Park, Young-Jin and Azizan, Navid and Chellappa, Rama},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2118--2142},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/byun26a/byun26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/byun26a.html},
  abstract = 	 {Large language models (LLMs) have demonstrated remarkable capabilities in complex reasoning and knowledge-intensive tasks, yet their potential for clinical decision making through test-time scaling (TTS) remains largely unexplored. While TTS has shown promise in improving reasoning performance by leveraging additional inference-time computation, its effectiveness in the medical domain has not been systematically investigated. This gap is further exacerbated by the impracticality of supervised fine-tuning for clinical reasoning tasks, owing to limited data availability and high annotation costs. In this work, we present a comprehensive study of TTS for clinical decision making. We systematically investigate the interaction between TTS and inference strategies, including direct answering, chain-of-thought prompting, and two-stage reasoning. We generate multiple candidate outputs in parallel using large reasoning models and aggregate them via self-consistency decoding. This approach requires no supervision while leveraging additional inference-time computation to improve performance. We provide a comprehensive empirical evaluation across both text-based medical question answering benchmarks and medical imaging modalities, demonstrating consistent improvements over single-pass inference baselines with performance gains of up to 30 percentage points. Finally, we provide an analytical characterization of TTS, deriving scaling laws that describe how performance improves with the number of samples and identifying conditions under which TTS yields reliable gains, along with empirical validation on diverse medical decision-making tasks.}
}



@InProceedings{pmlr-v315-zhang26a,
  title = 	 {CardAIc-Agents: A Multimodal Framework with Hierarchical Adaptation for Cardiac Care Support},
  author =       {Zhang, Yuting and Bunting, Karina V. and Champsi, Asgher and Wang, Xiaoxia and Lu, Wenqi and Thorley, Alexander and Hothi, Sandeep S and Qiu, Zhaowen and Buyukates, Baturalp and Kotecha, Dipak and Duan, Jinming},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2143--2170},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/zhang26a/zhang26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/zhang26a.html},
  abstract = 	 {Cardiovascular diseases (CVDs) remain the foremost cause of mortality worldwide, a burden worsened by a severe deficit of healthcare workers. Artificial intelligence (AI) agents have shown potential to alleviate this gap through automated detection and proactive screening, yet their clinical application remains limited by: (1) rigid sequential workflows, whereas clinical care often requires adaptive reasoning that selects specific tests and, based on their results, guides personalised next steps; (2) reliance solely on intrinsic model capabilities to perform role assignment without domain-specific tool support; (3) general and static knowledge bases without continuous learning capability; and (4) fixed unimodal or bimodal inputs and lack of on-demand visual outputs when clinicians require visual clarification. In response, a multimodal framework, CardAIc-Agents, is proposed to augment models with external tools and adaptively support diverse cardiac tasks. First, a CardiacRAG agent generates task-aware plans from updatable cardiac knowledge, while the Chief agent integrates tools to autonomously execute these plans and deliver decisions. Second, to enable adaptive and case-specific customization, a stepwise update strategy is developed to dynamically refine plans based on preceding execution results, once the task is assessed as complex. Third, a multidisciplinary discussion team is proposed which is automatically invoked to interpret challenging cases, thereby supporting further adaptation. In addition, visual review panels are provided to assist validation when clinicians raise concerns. Experiments across three datasets showed the efficiency of CardAIc-Agents compared to mainstream Vision–Language Models (VLMs) and state-of-the-art agentic systems.}
}



@InProceedings{pmlr-v315-byun26b,
  title = 	 {Adaptive Inference for Medical Vision Transformers: Token Reduction or Early Exit?},
  author =       {Byun, Ji Young and Lee, HyunSeo and Shuff, Jordan and Venkatesh, Rengaraj and Shekhawat, Nakul S. and Parikh, Kunal S. and Chellappa, Rama},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2171--2191},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/byun26b/byun26b.pdf},
  url = 	 {https://proceedings.mlr.press/v315/byun26b.html},
  abstract = 	 {Vision Transformers (ViTs) have demonstrated exceptional performance in medical image analysis, yet their computational demands hinder clinical deployment, particularly in time-sensitive applications. Medical imaging requires sample-adaptive optimization due to dataset heterogeneity across modalities and sample complexity; uniform strategies do not balance efficiency and accuracy well. We propose a unified adaptive inference framework that combines Token Reduction (TR) and Early Exiting (EE) through dataset-specific profiling. Our approach quantifies spatial redundancy via Jensen-Shannon Divergence (JSD) and prediction confidence at intermediate layers to train a lightweight predictor that dynamically selects inference strategies at test time. Across five medical datasets, including a real-world cataract dataset (INSIGHT), our framework achieves 71.4% average floating-point operations (FLOPs) reduction with only 0.1pp accuracy loss, substantially outperforming individual strategies (EE-only: 55.9%, TR-only: 57.7%). On PathMNIST, our adaptive inference framework simultaneously improves accuracy by 1.3pp while reducing computation by 77.2%. On INSIGHT, we maintain baseline accuracy with 69.8% FLOPs reduction, demonstrating robust real-world clinical applicability.}
}



@InProceedings{pmlr-v315-cosarinsky26a,
  title = 	 {CheXmask-U: Quantifying uncertainty in landmark-based anatomical segmentation for X-ray images},
  author =       {Cosarinsky, Matias and Gaggion, Nicolas and Echeveste, Rodrigo and Ferrante, Enzo},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2192--2207},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/cosarinsky26a/cosarinsky26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/cosarinsky26a.html},
  abstract = 	 {In this work, we study uncertainty estimation for anatomical landmark-based segmentation on chest X-rays. Inspired by hybrid neural network architectures that combine standard image convolutional encoders with graph-based generative decoders, and leveraging their variational latent space, we derive two complementary measures: (i) latent uncertainty, captured directly from the learned distribution parameters, and (ii) predictive uncertainty, obtained by generating multiple stochastic output predictions from latent samples. Through controlled corruption experiments we show that both uncertainty measures increase with perturbation severity, reflecting both global and local degradation. We demonstrate that these uncertainty signals can identify unreliable predictions by comparing with manual ground-truth, and support out-of-distribution detection on the CheXmask dataset. More importantly, we release CheXmask-U, a large-scale dataset of 657,566 chest X-ray landmark segmentations with per-node uncertainty estimates, enabling researchers to account for spatial variations in segmentation quality when using these anatomical masks. Our findings establish uncertainty estimation as a promising direction to enhance robustness and safe deployment of landmark-based anatomical segmentation methods in chest X-rays. A fully working interactive demo of the method is available at CheXmask-U-demo and the source code at CheXmask-U-code.}
}



@InProceedings{pmlr-v315-arvapalli26a,
  title = 	 {Evidential Retriever: Uncertainty-Aware Medical Image Retrieval},
  author =       {Arvapalli, Sai Susmitha and Namboodiri, Vinay P.},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2208--2232},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/arvapalli26a/arvapalli26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/arvapalli26a.html},
  abstract = 	 {Medical image retrieval systems could play a vital role in clinical decision support by enabling physicians to find visually and semantically similar cases from large medical databases. However, deep learning-based retrieval models often overlook uncertainty in their predictions. To address this, we propose the Evidential Retriever, a novel architecture that combines evidential deep learning principles with transformer-based image representations to achieve more accurate and calibrated retrieval. Built upon a Swin Transformer backbone, our model features a dual-headed design: a retrieval head that performs metric learning for robust image embeddings, and an evidential head that models predictive uncertainty. We use a unified dual-loss, combining a regularized contrastive loss with an evidential loss. Experiments on five diverse medical imaging datasets (CheXpert, NIH-14, ISIC17, COVID-QU-Ex, and KVASIR) demonstrate that our method outperforms state-of-the-art retrieval models in retrieval accuracy and uncertainty estimation. Furthermore, we demonstrate that our evidential framework is architecture-agnostic and can be used to improve the calibration of large-scale Foundation Models.}
}



@InProceedings{pmlr-v315-wang26e,
  title = 	 {What Fine-Tuning Changes: A Radiomic Lens on Prostate Foundation Model Representations},
  author =       {Wang, Yipei and Chen, Yaxi and Yan, Wen and Thorley, Natasha and Ng, Alexander and Barratt, Dean C. and Alexander, Daniel C. and Punwani, Shonit and Emberton, Mark and Kasivisvanathan, Veeru and Hu, Yipeng},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2233--2247},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/wang26e/wang26e.pdf},
  url = 	 {https://proceedings.mlr.press/v315/wang26e.html},
  abstract = 	 {Clarifying how foundation model encoders change during fine-tuning is important for transparency and trustworthiness in their medical imaging applications. It may also be useful for further understanding, developing, and adapting these models. However, the latent representations produced by such encoders are high dimensional and lack explicit semantic meaning, making it difficult to characterise how task-specific adaptation modifies them. In this study, we introduce a radiomics-based framework that provides an interpretable lens through which these representational changes can be examined and often better understood. Using prostate cancer patient imaging data, we train a two-layer MLP to learn the relationship between radiomic descriptors and encoder embeddings prior to fine-tuning. This model captures non-linear associations through its first layer, while the final linear layer offers an interpretable mapping from radiomic attributes to (transformed) latent features. To quantify the effect of fine-tuning, the first layer is fixed, and only the linear layer is re-estimated using the embeddings from the fine-tuned encoder. Comparing the pre- and post-fine-tuning linear weights yields a direct quantitative measure of how the encoder’s emphasis on specific radiomic characteristics shifts during fine-tuning. We validate the approach using a prostate MRI foundation model and multiple downstream tasks. The analysis reveals consistent, task-dependent changes in the encoder’s sensitivity to radiomic texture and intensity features. This work provides the first radiomics-based methodology for systematically interpreting how fine-tuning restructures foundation model representation in medical imaging.}
}



@InProceedings{pmlr-v315-wang26f,
  title = 	 {EFIQA: Explainable Fundus Image Quality Assessment via Anatomical Priors},
  author =       {Wang, Pengwei and Morano, Jos\'{e} and Wan, Qian and Bogunovi\'c, Hrvoje},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2248--2264},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/wang26f/wang26f.pdf},
  url = 	 {https://proceedings.mlr.press/v315/wang26f.html},
  abstract = 	 {Image quality control is vital for a wide range of downstream applications. Deep learning-based image quality assessment methods typically train classifiers on dataset-specific quality labels, inheriting two limitations: (1) generalization is tied to the labeling criteria of the training set and (2) these methods cannot provide spatial feedback on where the quality is degraded, lacking explainability. In this work, we propose EFIQA, a framework that requires no quality-related supervision and produces spatial quality maps by design. Rather than learning “what is degradation” from human-annotated labels, EFIQA learns “what should be there” by leveraging anatomical priors. For fundus photography, we instantiate this as a two-stage approach, by first training an unsupervised anomaly detector via masked anatomical inpainting to identify regions of missing vasculature, and then distilling this prior knowledge into a shallow adapter mapping features of a frozen foundation model to precise quality maps. External-dataset evaluation demonstrates that this label-free approach with minimal adaptation achieves better performance and explainability compared with supervised methods across benchmarks with different quality criteria, highlighting its potential for real-world applications.}
}



@InProceedings{pmlr-v315-rahman26a,
  title = 	 {GenVOG-DiT: A Transformer-Based Diffusion Model for Pose-Driven, Patient-Agnostic Nystagmus VOG Video Generation},
  author =       {Rahman, Aimon and Green, Kemar E. and Patel, Vishal M.},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2265--2282},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/rahman26a/rahman26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/rahman26a.html},
  abstract = 	 {Nystagmus, an involuntary eye movement indicative of neurological and vestibular disorders, is traditionally diagnosed using costly equipment or expert visual inspection: both of which limit accessibility in nonspecialist settings. Recent advances in computer vision and deep learning present an opportunity to automate the detection of nystagmus from standard video recordings. However, progress is hindered by the scarcity of publicly available video datasets due to privacy concerns surrounding ocular biometric data. In this work, we propose the use of synthetically generated eye movement videos to mitigate data limitations. Using video diffusion models, we simulate diverse clinically plausible nystagmus patterns without relying on real patient data, enabling scalable training while preserving privacy. We show that models trained on synthetic data generalize effectively to real-world settings and show potential for integration into telehealth applications. Our approach advances the development of accessible, generalizable, and privacy-aware diagnostic tools for eye movement disorders.}
}



@InProceedings{pmlr-v315-shi26b,
  title = 	 {Real-Time Novel-View Freehand Ultrasound Imaging via Point-Cloud Rendering and Diffusion-Bridge Completion},
  author =       {Shi, Hanrui and Mailh{\'e}, Boris and Zhang, Zheyuan and Liu, Yikang and Chen, Xiao and Mukherjee, Ankush and Chen, Terrence and Sun, Shanhui},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2283--2296},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/shi26b/shi26b.pdf},
  url = 	 {https://proceedings.mlr.press/v315/shi26b.html},
  abstract = 	 {Freehand ultrasound imaging is limited by sparse sampling and restricted probe coverage, which prevent consistent visualization of unseen planes and oblique orientations. We propose a real-time framework for novel-view ultrasound imaging that combines point-cloud rendering with diffusion-bridge completion. Given a sequence of 2D B-mode images and tracked probe poses, each novel view is first rendered as a partially observed slice from the reconstructed point cloud geometry, then completed by an Image-to-Image Schr{ö}dinger Bridge (I$^2$SB) model to synthesize anatomically coherent textures. The diffusion-bridge formulation accelerates convergence by conditioning on visible regions instead of noise, enabling stochastic yet efficient generation. A latent I$^2$SB variant further improves computational efficiency for high-resolution ultrasound data. Experiments on an abdominal dataset demonstrate realistic novel-view synthesis with fine structural continuity and real-time inference ($<$0.2 seconds per view), outperforming standard diffusion inpainting baselines in both speed and visual fidelity. The proposed method provides an efficient generative approach for interactive and view-adaptive ultrasound visualization.}
}



@InProceedings{pmlr-v315-zhao26a,
  title = 	 {Clinical Risk-Aware Multi-Level Grading for Coronary Artery Stenosis through Curved Feature Reconstruction},
  author =       {Zhao, Shishuang and Li, Hongtai and Hou, Junjie and Liu, Yuhang},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2297--2310},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/zhao26a/zhao26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/zhao26a.html},
  abstract = 	 {Developing a multi-level grading model for coronary artery stenosis holds great clinical significance for the diagnosis of coronary artery disease. However, designing an effective multi-level deep learning algorithm faces significant challenges. Specifically, utilizing CCTA or 3D SCPR images alone presents inherent shortcomings: CCTA images are difficult to analyze due to the tortuous paths of blood vessels, while 3D SCPR images are prone to abnormal distortions that hinder accurate grading. Furthermore, different stenosis grades are associated with varying clinical risks, and incorporating this association into the algorithm is non-trivial. To address the former problems, we propose the Curved Feature Reconstruction (CFR) module, which uses vessel curves as a prior and employs a point-by-point correspondence strategy to precisely align and fuse features from both 3D SCPR and CCTA images. Meanwhile, a Clinical Risk-Aware (CR) Loss is employed to introduce clinical risk relevance into the network training so that the algorithm can better align with the clinical diagnosis. The experimental results on an in-house dataset reveal that our approach significantly outperforms other methods, and several ablation studies also demonstrate the effectiveness of our proposed designs.}
}



@InProceedings{pmlr-v315-le26a,
  title = 	 {Ultra-ECP: Ellipse-Constrained and Point-Robust Foundation Model Adaptation for Fetal Cardiac Ultrasound Segmentation},
  author =       {Le, Minh H. N. and Le, Khanh T. Q. and Vinh, Tuan and Nguyen, Thanh-Huy and Huynh, Han H. and Pham, Khoa D. and Vu, Anh Mai and Kha, Hien Q. and Nguyen, Phat K. and Bagci, Ulas and Xu, Min and Yang, Carl and Huynh, Phat K. and Le, Nguyen Quoc Khanh},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2311--2320},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/le26a/le26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/le26a.html},
  abstract = 	 {Accurate fetal cardiac segmentation from four-chamber ultrasound images is essential for reliable prenatal biometrics, yet foundation models such as SAM remain sensitive to point-prompt placement, produce anatomically inconsistent masks, and require costly full-model fine-tuning. We introduce Ultra-ECP, a parameter-efficient framework that adapts UltraSAM for robust single-point fetal cardiac segmentation. Ultra-ECP integrates three components: (i) a LoRA-based adaptation applied to the prompt encoder and mask decoder, reducing trainable parameters by over 98%; (ii) an Ellipse-Aware Loss that regularizes predictions toward anatomically plausible elliptical cardiac shapes; and (iii) a Point-Robust Augmentation strategy that simulates click imprecision to enhance robustness. Evaluated on the FOCUS dataset, Ultra-ECP outperforms SAM, MedSAM, and fine-tuned U-Net baselines. For thoracic segmentation, it achieves a mean DSC of 95.09% and HD95 of 25.96 px. For cardiac segmentation, Ultra-ECP obtains a mean DSC of 92.60% and HD95 of 18.25 px, while maintaining stability under point displacements of up to 10 pixels. Predictions are consistently smooth and elliptical, addressing common failure modes of existing approaches. Ultra-ECP provides an effective and computationally lightweight pathway for adapting large vision models to fetal cardiac biometrics, enabling reliable and clinically practical semi-automated tools.}
}



@InProceedings{pmlr-v315-di26a,
  title = 	 {Lesion-Aware Reconstruction with Principal Network: Enhancing Pseudo-Label Reliability in Semi-Supervised Clinical Lesion Detection},
  author =       {DI, Shiwan and LI, Jupeng and YANG, Yuxuan and JIN, Qian and AN, Guorui and YANG, Jingwen and WANG, Yue and GUO, Yong and ZHANG, Xinyue and MA, Ruohan and LI, Gang},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2321--2337},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/di26a/di26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/di26a.html},
  abstract = 	 {Purpose. In lesion detection tasks, labeled medical data are often scarce, limiting the performance of fully supervised models. Teacher-student (TS) frameworks based on semi-supervised learning (SSL) have emerged as effective solutions to leverage unlabeled data. However, the inherent high-confidence bias of teacher networks frequently leads to the propagation of erroneous pseudo-labels, degrading the generalization ability of student networks. To address this critical issue, we propose a novel teacher-principal-student (TPS) framework. Methods. The core innovation lies in introducing a principal network, which integrates lesion-aware reconstruction to filter low-quality pseudo-labels generated by the teacher network. Specifically, the principal network leverages anatomical prior knowledge and reconstruction consistency constraints to assess the reliability of teacher-generated pseudo-labels, ensuring only high-fidelity pseudo-labeled data are used for training the student network. This design fundamentally mitigates the adverse effects of teacher prediction bias and error propagation. Results. Extensive experiments on jaw lesion detection datasets demonstrate the superiority of our approach. With the same label ratio, our SSL network achieves 81.5% mAP@0.5, outperforming mainstream SSL methods by 3.0% while narrowing the performance gap with fully supervised learning to only 3.3%. Conclusion. Our proposed TPS framework outperforms state-of-the-art SSL approaches in the jaw lesion detection task. It not only achieves competitive performance comparable to fully supervised models but also significantly reduces reliance on labeled clinical data, providing a reliable technical solution to promote the clinical translation of lesion detection systems.}
}



@InProceedings{pmlr-v315-li26g,
  title = 	 {UnEBOLT: A Unified Model for EEG-to-BOLD Translation and Functional Connectivity Reconstruction},
  author =       {Li, Yamin and Lou, Ange and Li, Chang and Wang, Shiyu and Pourmotabbed, Haatef and Xu, Ziyuan and Zhang, Shengchao and Englot, Dario J. and Kolouri, Soheil and Moyer, Daniel and Bayrak, Roza G. and Chang, Catie},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2338--2351},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/li26g/li26g.pdf},
  url = 	 {https://proceedings.mlr.press/v315/li26g.html},
  abstract = 	 {Functional magnetic resonance imaging (fMRI) provides high-resolution, whole-brain dynamic information, but is costly and immobile, limiting its utility in low-resource settings. EEG-to-fMRI translation via deep learning offers a promising alternative, enabling access to deep brain activity from scalp EEG signals in naturalistic settings. However, current state-of-the-art methods for EEG-to-fMRI translation require training separate models for each brain region, limiting efficiency and scalability. Here, we introduce UnEBOLT, a Unified model for EEG-to-BOLD Translation. UnEBOLT is an end-to-end framework that predicts whole-brain fMRI time series from EEG by adaptive multi-region decoding within a single model. This approach enables efficient and comprehensive inference while also reconstructing subject-specific functional connectivity matrices, a representation that provides insight into neuronal interactions and which has been successfully utilized for clinical biomarkers. Our results show that UnEBOLT achieves comparable performance to dedicated ROI-specific models while scaling to multi-region prediction. Additionally, the reconstructed fMRI time series enable functional connectivity estimation, which may have broad applications in neuroscience.}
}



@InProceedings{pmlr-v315-wu26c,
  title = 	 {A Pragmatic Note on Evaluating Generative Models with Fr{é}chet Inception Distance for Retinal Image Synthesis},
  author =       {Wu, Yuli and Liu, Fucheng and Yilmaz, R\"uveyda and Konermann, Henning and Walter, Peter and Stegmaier, Johannes},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2352--2368},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/wu26c/wu26c.pdf},
  url = 	 {https://proceedings.mlr.press/v315/wu26c.html},
  abstract = 	 {Fr{é}chet Inception Distance (FID), computed with an ImageNet pretrained Inception-v3 network, is widely used as a state-of-the-art evaluation metric for generative models. It assumes that feature vectors from Inception-v3 follow a multivariate Gaussian distribution and calculates the 2-Wasserstein distance based on their means and covariances. While FID effectively measures how closely synthetic data match real data in many image synthesis tasks, the primary goal in biomedical generative models is often to enrich training datasets ideally with corresponding annotations. For this purpose, the gold standard for evaluating generative models is to incorporate synthetic data into downstream task training, such as classification and segmentation, to pragmatically assess its performance. In this paper, we examine cases from retinal imaging modalities, including color fundus photography and optical coherence tomography, where FID and its related metrics misalign with task-specific evaluation goals in classification and segmentation. We highlight the limitations of using various metrics, represented by FID and its variants, as evaluation criteria for these applications and address their potential caveats in broader biomedical imaging modalities and downstream tasks.}
}



@InProceedings{pmlr-v315-jaubert26a,
  title = 	 {Breaking the Memory Barrier: Efficient Multi-Class 3D Segmentation for Hundreds of Classes},
  author =       {Jaubert, Olivier and Traynor, William and Mikhael, Shadia and Hipwell, John H. and Dahdouh, Sonia},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2369--2387},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/jaubert26a/jaubert26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/jaubert26a.html},
  abstract = 	 {Medical image segmentation has transformed clinical routine by providing fast and accurate methods for the automated measurement of biomarkers and lesions. While foundation models promise broad generalization across hundreds of anatomical structures, they often under-perform compared to task-specific deep learning methods like nnUNet. However, these specialized models face scalability challenges when segmenting large numbers of classes in 3D images. We introduce a class-scalable 3D segmentation method combining a low-rank basis and projection operator with a chunked cross entropy and Dice loss. This design decouples the number of classes from the peak memory requirements, enabling the segmentation of hundreds of classes in 3D. Integrated into the nnUNet framework, the proposed method supports state-of-the-art training and architectures. Scalability of our framework was demonstrated by creating a novel synthetic 3D “Toy Dataset” with up to 1000 different classes and obtaining high Dice scores ($>0.95$) on it. Performance on the TotalSegmentator dataset (117 classes) was assessed showing comparable mean Dice scores between the proposed method and the multi-model TotalSegmentator baseline ($0.913$ vs $0.928$) and outperforming VISTA3D ($0.803$). These results highlight a practical path toward a unified, scalable foundation model for comprehensive 3D medical image segmentation of thousands of classes.}
}



@InProceedings{pmlr-v315-deng26a,
  title = 	 {Sparse Subspace Diffusion Model for Physically Consistent Accelerated MRI Reconstruction},
  author =       {Deng, Xiangyao and Shen, Zhiqiang and Dayarathna, Sanuwani and Meneses, Juan P. and Uribe, Sergio and Chen, Zhaolin},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2388--2403},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/deng26a/deng26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/deng26a.html},
  abstract = 	 {Magnetic resonance imaging (MRI) provides excellent soft-tissue contrast but suffers from long acquisition times. Accelerated MRI alleviates this issue by undersampling k-space, but this approach introduces aliasing artifacts and information loss. Traditional compressed sensing methods exploit handcrafted sparse priors, whereas deep learning approaches learn data-driven priors, but both often struggle at high acceleration rates due to severe information degradation. This study introduces a diffusion-based reconstruction framework, termed the Sparse Subspace Diffusion Model (SSDM), that performs MRI reconstruction within an adaptive sparse space. The proposed approach integrates coupling convolutional dictionary learning with diffusion-based generative modeling to decompose MR images into multiple orthogonal sparse subspaces and reconstruct them under measurement-consistency constraints. This formulation enables diffusion modeling in a physically meaningful latent space, effectively bridging the gap between data-driven learning and physics-guided reconstruction. Experimental results on the fastMRI dataset demonstrate that the proposed method achieves higher reconstruction quality than existing diffusion- and sparsity-based approaches, with better preservation of fine details and suppression of artifacts across various acceleration factors.}
}



@InProceedings{pmlr-v315-guasch-marti26a,
  title = 	 {Aloe-Vision: Robust Vision-Language Models for Healthcare},
  author =       {Guasch-Mart\'i, Jaume and Lopez-Cuena, Enrique and Su\'arez-Fern\'andez, Mart\'in and Bayarri-Planas, Jordi and Arias-Duart, Anna and Garcia-Gasulla, Dario},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2404--2426},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/guasch-marti26a/guasch-marti26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/guasch-marti26a.html},
  abstract = 	 {Large Vision-Language Models (LVLMs) specialized in healthcare are emerging as a promising research direction due to their potential impact in clinical and biomedical applications. However, progress is constrained by the scarcity of high-quality medical multimodal data, concerns about robustness in safety-critical settings, and the narrow and potentially contaminated evaluation benchmarks that limit reliable assessment. To address these issues, the field requires state-of-the-art solutions to be fully open and reproducible systems in which all components can be inspected, evaluated, and improved. This work introduces Aloe-Vision-Data, a large-scale, quality-filtered mixture which integrates both medical and general domains across multimodal and text-only sources, designed for direct use in model fine-tuning. Building on this dataset, we train the Aloe-Vision family of medical LVLMs, openly released with full weights, training recipes and data, in two scales (7B and 72B). Through comprehensive benchmarking, we demonstrate that high quality training mixtures produce balanced LVLMs which yield significant gains over the baseline models without compromising general capabilities, achieving competitive performance with respect to state-of-the-art alternatives. To support reliable evaluation, we introduce CareQA-Vision, a carefully curated vision benchmark derived from MIR and EIR exams, the residency entrance exams for medical and nursing specialists in Spain, offering novel vision questions with low likelihood of contamination. Finally, we show that current LVLMs remain vulnerable to adversarial and misleading inputs, underscoring reliability challenges in clinical contexts.}
}



@InProceedings{pmlr-v315-kumar26a,
  title = 	 {ResDCE-diff: Dynamic contrast enhanced MRI translation in prostate cancer using residual denoising diffusion models},
  author =       {Kumar, Kishore and Ramanarayanan, Sriprabha and Ram, Keerthi and Agarwal, Harsh and Venkatesan, Ramesh and Sivaprakasam, Mohanasankar},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2427--2446},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/kumar26a/kumar26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/kumar26a.html},
  abstract = 	 {Dynamic contrast enhanced MRI (DCE-MRI) identifies early perfusion patterns of aggressive prostate tumors, but its reliance on gadolinium contrast agents limits wider clinical adoption due to safety concerns. Diffusion models have recently emerged as a potential solution to synthesize contrast-enhanced images directly from non-contrast MRI. Previous diffusion models for prostate DCE-MRI require long inference times as they need hundreds or thousands of sampling steps, limiting practical use. Moreover, the reverse generation process for DCE-MRI synthesis starts from pure noise without explicitly utilizing the prior information present in the non-contrast inputs in the diffusion process. We propose ResDCE-diff, a residual denoising diffusion model to synthesize early and late phase DCE-MRI images from non-contrast multi-modal inputs (T2-w, Apparent diffusion coefficient, and pre-contrast MRI). The diffusion process shifts anatomical, micro-structurally relevant and physics-informed residual features between the non-contrast inputs and DCE-MRI targets. Extensive experiments using the PROSTATEx dataset show that ResDCE-diff (i) consistently outperforms previous methods across early and late DCE-MRI phases with improvement margins of +1.29 dB and +1.17 dB in PSNR, +0.04 and +0.03 in SSIM respectively, (ii) requires significantly fewer diffusion steps ($\approx$ 15) compared to the baseline diffusion model, and (iii) exhibits relatively higher diagnostically relevant synthesis quality.}
}



@InProceedings{pmlr-v315-sadegheih26a,
  title = 	 {Towards Modality-Agnostic Continual Domain-Incremental Brain Lesion Segmentation},
  author =       {Sadegheih, Yousef and Merhof, Dorit and Kumari, Pratibha},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2447--2460},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/sadegheih26a/sadegheih26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/sadegheih26a.html},
  abstract = 	 {Brain lesion segmentation from multi-modal MRI often assumes fixed modality sets or predefined pathologies, making existing models difficult to adapt across cohorts and imaging protocols. Continual learning (CL) offers a natural solution, but current approaches either impose a maximum modality configuration or suffer from severe forgetting in buffer-free settings. We introduce CLMU-Net, a replay-based CL framework for 3D brain lesion segmentation that supports arbitrary and variable modality combinations without requiring prior knowledge of the maximum set. A conceptually simple yet effective channel-inflation strategy maps any modality subset into a unified multi-channel representation, enabling a single model to operate across diverse datasets. To enrich inherently local 3D patch features, we incorporate lightweight domain-conditioned textual embeddings that provide global modality-disease context for each training case. Forgetting is further reduced through principled replay using a compact buffer composed of both prototypical and challenging samples. Experiments on five heterogeneous MRI brain datasets demonstrate that CLMU-Net consistently outperforms popular CL baselines. Notably, our method yields an average Dice score improvement of $\geq 18\%$ while remaining robust under heterogeneous-modality conditions. These findings underscore the value of flexible modality handling, targeted replay, and global contextual cues for continual medical image segmentation.}
}



@InProceedings{pmlr-v315-tankel26a,
  title = 	 {INFORM-CT: INtegrating LLMs and VLMs FOR Incidental Findings Management in Abdominal CT},
  author =       {Tankel, Idan and Mazor, Nir and Brada, Rafi and Lebedis, Christina and Ben-Yosef, Guy},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2461--2473},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/tankel26a/tankel26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/tankel26a.html},
  abstract = 	 {Incidental findings in CT scans, though often benign, can have significant clinical implications and should be reported according to established guidelines. Traditional manual inspection by radiologists is time-consuming and subject to variability. This paper proposes a novel framework that leverages large language models (LLMs) and foundational vision–language models (VLMs) within a plan-and-execute agentic architecture to improve the efficiency and precision of incidental-findings detection, classification, and reporting in abdominal CT scans. Given medical guidelines for abdominal organs, the management process is automated through a planner–executor framework. The planner, based on an LLM, generates Python scripts from predefined base functions, while the executor runs these scripts to perform the required detections and evaluations using VLMs, segmentation models, and image-processing subroutines. We demonstrate the effectiveness of our approach through experiments on a CT-abdominal benchmark covering three organs, in a fully automatic end-to-end setup. Our results show that the proposed framework outperforms existing purely VLM-based approaches in both accuracy and efficiency.}
}



@InProceedings{pmlr-v315-silbernagel26a,
  title = 	 {RNCA: Self-Repairing Segmentation Masks},
  author =       {Silbernagel, Malte and Alonso, Albert and Petersen, Jens and Ibragimov, Bulat and de Bruijne, Marleen and Wyburd, Madeleine K.},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2474--2495},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/silbernagel26a/silbernagel26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/silbernagel26a.html},
  abstract = 	 {Accurately predicting topologically correct masks remains a difficult task for general segmentation models, which often produce fragmented or disconnected outputs. Fixing these artifacts typically requires handcrafted refinement rules or architectures specialized to a particular task. Here, we show that Neural Cellular Automata (NCA) can be directly repurposed as an effective refinement mechanism, using local, iterative updates guided by image context to repair segmentation masks. By training on imperfect masks and ground truths, the automaton learns the structural properties of the target shape while relying solely on local information. When applied to coarse, globally predicted masks, the learned dynamics progressively reconnect broken regions, prune loose fragments and converge towards stable, topologically consistent results. We show how refinement NCA (RNCA) can be easily applied to repair common topological errors produced by different base segmentation models and tasks: for fragmented retinal vessels, it yields 2–3% gains in Dice/clDice and improves Betti Errors, reducing $\beta_0$ errors by 60% and $\beta_1$ by 20%; for myocardium, it repairs 61.5% of broken cases in a zero-shot setting while lowering ASSD and HD by 19% and 16%, respectively. This showcases NCAs as effective and broadly applicable refiners.}
}



@InProceedings{pmlr-v315-he26a,
  title = 	 {Incentivizing DINOv3 Adaptation for Medical Vision Tasks via Feature Disentanglement},
  author =       {He, Zhicheng and Fu, Yibing and Jin, Yueming},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2496--2513},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/he26a/he26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/he26a.html},
  abstract = 	 {The emerging general vision foundation models such as DINOv3 have demonstrated remarkable representation learning capability in natural image domains. However, transferring these representations to medical imaging is challenging due to substantial domain discrepancies. To bridge this gap, parameter-efficient fine-tuning (PEFT) has emerged as a promising strategy to adapt these vision foundation models to medical vision tasks by updating only a small subset of parameters while preserving pretrained knowledge. Despite the efficiency, existing PEFT strategies overlook that pretrained features inherently interleave task-relevant semantics with task-irrelevant patterns and noise, potentially limiting effective adaptation in medical scenarios. To address this challenge, we propose DINOv3-FD, a task-oriented feature disentanglement framework that adapts DINOv3 to medical vision tasks. DINOv3-FD introduces a dual-stream adapter that separates features into task-relevant and task-irrelevant subspaces, reinforced by an orthogonality loss to encourage their mutual independence. Additionally, a distributional regularization loss drives the task-irrelevant branch toward task-agnostic predictions, discouraging it from encoding task-specific semantics. Consequently, the task-relevant stream is encouraged to retain more discriminative representations that facilitate downstream medical tasks. Experimental results show that DINOv3-FD outperforms other PEFT strategies over three medical classification tasks, demonstrating the effectiveness of feature disentanglement.}
}



@InProceedings{pmlr-v315-mahmoud26a,
  title = 	 {On the Feasibility of Fr{é}chet Radiomic Distance–Constrained Adversarial Examples in Medical Imaging: Methods and Trade-offs},
  author =       {Mahmoud, Mohamed and Khaled, Shehab and Elkhayat, Mohamed and Fayyad, Jamil},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2514--2528},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/mahmoud26a/mahmoud26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/mahmoud26a.html},
  abstract = 	 {Adversarial attacks expose critical vulnerabilities in medical imaging AI models; yet most existing methods violate the textural and structural characteristics that define authentic medical images by disregarding the clinical and radiomic plausibility of the generated perturbations. In this study, we present the first systematic investigation into the existence and feasibility of adversarial examples constrained by the Fr{é}chet Radiomic Distance (FRD), a quantitative measure of radiomic similarity capturing textural, structural, and statistical coherence between images. We formulate a gradient-free, multi-objective optimization framework based on Multi-Objective Particle Swarm Optimization (MOPSO) operating in the Discrete Cosine Transform (DCT) domain. This framework jointly minimizes FRD and maximizes adversarial deviation, allowing a principled exploration of the trade-off between radiomic fidelity and adversarial strength without requiring gradient access. Empirical evidence across multiple medical imaging models demonstrates that enforcing strong FRD constraints (FRD $\leq$ 0.05) dramatically reduces adversarial feasibility. Perturbations preserving radiomic fidelity consistently fail to achieve meaningful adversarial deviation, suggesting that radiomic realism imposes an intrinsic feasibility boundary on adversarial generation. These findings establish radiomic consistency as a fundamental constraint on adversarial vulnerability, offering theoretical and empirical insight toward the development of inherently robust and trustworthy medical imaging AI.}
}



@InProceedings{pmlr-v315-zheng26a,
  title = 	 {Heterogeneous Aligned Fusion for Survival Classification with Missing Modalities},
  author =       {Zheng, Zheng and Guo, Yuzhi and Hu, Xiao and Miao, Yuwei and Ma, Hehuan and Gao, Jean and Huang, Junzhou},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2529--2546},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/zheng26a/zheng26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/zheng26a.html},
  abstract = 	 {Accurate survival classification is essential for guiding personalized treatment in head and neck cancer. Heterogeneous biomedical data, from histopathology to clinical and laboratory measurements, offer complementary prognostic value but differ in dimensionality, reside in incompatible feature spaces, and are frequently missing, making robust multimodal learning challenging. To address this, we propose HAF (Heterogeneous Aligned Fusion), a three-stage framework for survival classification under heterogeneous and incomplete multimodal inputs. HAF (i) uses detachment and prognostic supervision to obtain stable representations, (ii) performs lightweight global alignment that projects all modalities into a shared latent space while preserving patient-level discriminability, and (iii) enforces monotonic robust fusion that encourages performance to remain stable or improve when modalities are added. To the best of our knowledge, HAF is the first approach that jointly leverages all seven modalities in the HANCOCK cohort. Extensive comparisons against representative late-, early-, attention-based, and bilinear-interaction fusion methods demonstrate that HAF consistently improves both accuracy and robustness under heterogeneous and partially missing modalities.}
}



@InProceedings{pmlr-v315-danaee26a,
  title = 	 {Exploring Entropy-based Active Learning for Fair Brain Segmentation},
  author =       {Danaee, Ghazal and Gaillochet, M\'elanie and Desrosiers, Christian and Lombaert, Herv\'e and Bouix, Sylvain},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2547--2562},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/danaee26a/danaee26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/danaee26a.html},
  abstract = 	 {Active learning (AL) has emerged as a crucial strategy for reducing the prohibitive costs associated with medical image segmentation. However, standard uncertainty-based AL methods typically focus on maximizing performance metrics, ignoring performance disparities or fairness across groups with sensitive attributes. While fair active learning has been explored in classification tasks, its intersection with medical image segmentation remains unaddressed. In this work, we introduced a fairness-aware active learning framework with a Weighted Entropy selection strategy that modulates uncertainty based on current group-specific performance estimates on the labeled set. To decouple true epistemic uncertainty from anatomical volume variances, we further utilized a masked, scaled entropy restricted to the region of interest. The framework was evaluated on synthetic T1-weighted brain MRIs with controlled left caudate bias in both strong and weak bias settings. A 3D U-Net was trained to segment the left caudate under several AL strategies, starting from both demographically balanced and strongly imbalanced initial labeled sets. Experiments demonstrated that our method markedly reduces performance disparities between groups compared to random sampling and standard uncertainty sampling. By prioritizing poorly segmented subgroups during the AL cycles, our method consistently achieved the highest equity-scaled performance and reduced the disparity metric by 75% (strong bias) and 86% (weak bias) relative to standard entropy at the final budget. Overall, this work is among the first studies on fair AL for medical image segmentation, offering an efficient strategy to train more equitable models in resource-constrained environments.}
}



@InProceedings{pmlr-v315-yang26a,
  title = 	 {Explainable Pathomics Feature Visualization via Correlation-aware Conditional Feature Editing},
  author =       {Yang, Yuechen and Guo, Junlin and Deng, Ruining and Zhu, Junchao and Lu, Zhengyi and Qu, Chongyu and Zhu, Yanfan and Guo, Xingyi and Wang, Yu and Zhao, Shilin and Yang, Haichun and Huo, Yuankai},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2563--2581},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/yang26a/yang26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/yang26a.html},
  abstract = 	 {Pathomics is a recent approach that offers rich quantitative features beyond what black-box deep learning can provide, supporting more reproducible and explainable biomarkers in digital pathology. However, many derived features (e.g., “second-order moment”) remain difficult to interpret, especially across different clinical contexts, which limits their practical adoption. Conditional diffusion models show promise for explainability through feature editing, but they typically assume feature independence, an assumption violated by intrinsically correlated pathomics features. Consequently, editing one feature while fixing others can push the model off the biological manifold and produce unrealistic artifacts. To address this, we propose a Manifold-Aware Diffusion (MAD) framework for controllable and biologically plausible cell nuclei editing. Unlike existing approaches, our method regularizes feature trajectories within a disentangled latent space learned by a variational auto-encoder (VAE). This ensures that manipulating a target feature automatically adjusts correlated attributes to remain within the learned distribution of real cells. These optimized features then guide a conditional diffusion model to synthesize high-fidelity images. Experiments demonstrate that our approach is able to navigate the manifold of pathomics features when editing those features. The proposed method outperforms baseline methods in conditional feature editing while preserving structural coherence.}
}



@InProceedings{pmlr-v315-qu26a,
  title = 	 {AdaFuse: Adaptive Multimodal Fusion for Lung Cancer Risk Prediction via Reinforcement Learning},
  author =       {Qu, Chongyu and Lu, Zhengyi and Lai, Yuxiang and Li, Thomas Z. and Zhu, Junchao and Guo, Junlin and Xiong, Juming and Zhu, Yanfan and Yang, Yuechen and Luna, Allen J. and Sandler, Kim L. and Landman, Bennett A. and Huo, Yuankai},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2582--2601},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/qu26a/qu26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/qu26a.html},
  abstract = 	 {Multimodal fusion has emerged as a promising paradigm for disease diagnosis and prognosis, integrating complementary information from heterogeneous data sources such as medical images, clinical records, and radiology reports. However, existing fusion methods process all available modalities through the network, either treating them equally or learning to assign different contribution weights, leaving a fundamental question unaddressed: for a given patient, should certain modalities be used at all? We present AdaFuse, an adaptive multimodal fusion framework that leverages reinforcement learning (RL) to learn patient-specific modality selection and fusion strategies for lung cancer risk prediction. AdaFuse formulates multimodal fusion as a sequential decision process, where the policy network iteratively decides whether to incorporate an additional modality or proceed to prediction based on the information already acquired. This sequential formulation enables the model to condition each selection on previously observed modalities and terminate early when sufficient information is available, rather than committing to a fixed subset upfront. We evaluate AdaFuse on the National Lung Screening Trial (NLST) dataset. Experimental results demonstrate that AdaFuse achieves the highest AUC (0.762) compared to the best single-modality baseline (0.732), the best fixed fusion strategy (0.759), and adaptive baselines including DynMM (0.754) and MoE (0.742), while using fewer FLOPs than all triple-modality methods. Our work demonstrates the potential of reinforcement learning for personalized multimodal fusion in medical imaging, representing a shift from uniform fusion strategies toward adaptive diagnostic pipelines that learn when to consult additional modalities and when existing information suffices for accurate prediction.}
}



@InProceedings{pmlr-v315-lu26a,
  title = 	 {MASC: Metal-Aware Sampling and Correction via Reinforcement Learning for Accelerated MRI},
  author =       {Lu, Zhengyi and Lu, Ming and Qu, Chongyu and Zhu, Junchao and Guo, Junlin and Lionts, Marilyn and Zhu, Yanfan and Yang, Yuechen and Yao, Tianyuan and Rajagopal, Jayasai and Landman, Bennett Allan and Wang, Xiao and Yan, Xinqiang and Huo, Yuankai},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2602--2620},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/lu26a/lu26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/lu26a.html},
  abstract = 	 {Metal implants in MRI cause severe artifacts that degrade image quality and hinder clinical diagnosis. Traditional approaches address metal artifact reduction (MAR) and accelerated MRI acquisition as separate problems. We propose MASC, a unified reinforcement learning framework that jointly optimizes metal-aware k-space sampling and artifact correction for accelerated MRI. To enable supervised training, we construct a paired MRI dataset using physics-based simulation, generating k-space data and reconstructions for phantoms with and without metal implants. This paired dataset provides simulated 3D MRI scans with and without metal implants, where each metal-corrupted sample has an exactly matched clean reference, enabling direct supervision for both artifact reduction and acquisition policy learning. We formulate active MRI acquisition as a sequential decision-making problem, where an artifact-aware Proximal Policy Optimization (PPO) agent learns to select k-space phase-encoding lines under a limited acquisition budget. The agent operates on undersampled reconstructions processed through a U-Net-based MAR network, learning patterns that maximize reconstruction quality. We further propose an end-to-end training scheme where the acquisition policy learns to select k-space lines that best support artifact removal while the MAR network simultaneously adapts to the resulting undersampling patterns. Experiments demonstrate that MASC’s learned policies outperform conventional sampling strategies, and end-to-end training improves performance compared to using a frozen pre-trained MAR network, validating the benefit of joint optimization. Cross-dataset experiments on FastMRI with physics-based artifact simulation further confirm generalization to realistic clinical MRI data.}
}



@InProceedings{pmlr-v315-yao26a,
  title = 	 {VFMStitch: A Vision-Foundation-Model Empowered Framework for 3D Ultrasound Stitching via Geometric–Semantic Feature Fusion},
  author =       {Yao, Xing and DiSanto, Nick and Yu, Runxuan and Wang, Jiacheng and Lu, Daiwei and Arenas, Gabriel and Oguz, Baris and Pouch, Alison and Schwartz, Nadav and Byram, Brett C and Oguz, Ipek},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2621--2639},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/yao26a/yao26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/yao26a.html},
  abstract = 	 {3D ultrasound (3DUS) stitching expands the field-of-view (FOV) by registering partially overlapping 3DUS volumes acquired from different probe positions. This task is intrinsically difficult due to large inter-volume translations and rotations, the impact of the sector-shaped FOV, as well as the heavy noise and artifacts inherent to ultrasound. With the rapid progress of Vision Foundation Models (VFMs) such as DINOv3, VFM-derived features have recently shown promise for downstream medical image registration tasks. However, existing VFM-based approaches primarily focus on deformable registration and are rarely evaluated for rigid alignment under large motions. Moreover, the feasibility of leveraging VFM-derived features for robust 3DUS stitching remains largely unexplored. In this study, we introduce VFMStitch, the first training-free, VFM-empowered 3DUS stitching framework that integrates point-cloud (PCD)–based geometric features with DINOv3-derived semantic descriptors. Extensive experiments demonstrate that VFMStitch substantially improves rigid registration accuracy compared to existing methods, validating the effectiveness of geometric–semantic fusion for challenging 3DUS stitching scenarios.}
}



@InProceedings{pmlr-v315-koser26a,
  title = 	 {Attention-to-Survival: Multimodal Fracture Risk Prediction Based on Pelvic Radiographs and Clinical Data from the Study of Osteoporotic Fractures},
  author =       {Koser, Niklas C. and Finck, Marten J. and Jan{\ss}en, Silja and Mouton, Coenraad and Lui, Li-Y. and Cummings, Steven R. and K{\"o}ser, Kevin and H{\"o}vener, Jan-B. and Pirk, S{\"o}ren and Gl{\"u}er, Claus-C.},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2640--2665},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/koser26a/koser26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/koser26a.html},
  abstract = 	 {Osteoporotic changes in the hip structure render the proximal femur particularly vulnerable to fractures, which leads to severe consequences for patients’ health and significant socioeconomic burdens, a strongly increasing problem in aging populations. Accurate risk estimation is therefore essential for initiating timely preventive measures. However, the current clinical standard measures, bone mineral density (BMD) and the Fracture Risk Assessment Tool (FRAX), provide only limited predictive value. Neither BMD nor FRAX captures structural characteristics that could be derived from widely available pelvic radiographs. To address this gap, we present the Attention-to-Survival Fusion (ATSF) model, a multimodal survival analysis framework that combines clinical risk factors (CRFs) with pelvic radiograph features. An attention-based architecture equipped with a deep conditional transformation model (DCTM) prediction head enables accurate estimation of time-dependent fracture risk. The ATSF model is designed to accommodate missing clinical variables, handle all forms of non-informative censoring, and provide modality-specific interpretability through the attention mechanisms. It was developed, validated and tested with data from 7825 women from the Study of Osteoporotic Fractures (SOF) followed for fracture incidence over 23 years. We benchmark ATSF against established baselines, including FRAX, the Cox proportional hazards model (CoxPH), and a deep learning reference model. Our results demonstrate significantly superior performance across concordance index (C-index) and area under the receiver operating characteristic curve (AUC), indicating the importance of integrating radiographic and clinical data within a unified survival framework. Furthermore, offering improved interpretability and a scalable multimodal design, the proposed method provides a promising alternative for advancing individualized hip-fracture risk prediction in osteoporosis research and precision medicine.}
}



@InProceedings{pmlr-v315-atici26a,
  title = 	 {From Surface to Viscera: 3D Estimation of Internal Anatomy from Body Surface Point Clouds},
  author =       {Atici, Salih Furkan and Kats, Eytan and Mensing, Daniel and Heinrich, Mattias P},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2666--2681},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/atici26a/atici26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/atici26a.html},
  abstract = 	 {Accurate pre-scan positioning in diagnostic imaging is essential for guiding acquisition and reducing manual calibration time, yet current automated approaches typically rely on dense volumetric representations that do not leverage the geometric properties or sparsity of surface representations. In this work, we introduce a sparse, point-cloud–based framework for estimating patient-specific 3D locations and shapes of multiple internal organs directly from the body surface. Our method leverages a new dual-encoder PointTransformer architecture: one encoder processes a mean-shape point cloud comprising 20 anatomical structures, while a second encoder extracts features from the patient’s body-surface point cloud. A shared decoder then predicts a deformed shape that estimates the hidden anatomy of the individual patient. This enables accurate organ localization without volumetric rasterization or autoencoder-style bottlenecks. Trained on the German National Cohort (NAKO) dataset, our model substantially outperforms volumetric convolutional autoencoder (CAE) baselines, achieving a mean Chamfer Distance of less than 5 mm and markedly lower surface-distance errors. These results demonstrate that sparse geometric learning with deformable point-cloud priors offers an efficient and highly effective alternative to dense convolutional deep learning methods for automated imaging workflow optimization.}
}



@InProceedings{pmlr-v315-halimi26a,
  title = 	 {Revealing Hidden Failure Modes in Chest X-ray Classification via Spectral Domain Analysis},
  author =       {Halimi, Samuel and Themyr, Loic and Abreu, Arnaud},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2682--2710},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/halimi26a/halimi26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/halimi26a.html},
  abstract = 	 {Deep learning models for chest X-ray anomaly detection remain vulnerable to subtle distributional shifts (e.g., acquisition technique, patient-related factors, and preprocessing). Traditional error analysis often relies on semantic metadata or model embeddings, which can mask low-level signal variations that degrade performance. In this work, we propose a data-centric framework for automated failure mode discovery using spectral analysis. We project images into the frequency domain and extract a compact profile summarizing the distribution of signal energy across frequency bands. By performing unsupervised clustering on these spectral profiles, we demonstrate that model failures are not randomly distributed, but are strongly concentrated within specific spectral clusters. This method effectively isolates "blind spots", enabling the prediction of model reliability and the discovery of performance-degrading data slices without requiring ground-truth failure annotations.}
}



@InProceedings{pmlr-v315-wong26a,
  title = 	 {Weight Space Correlation Analysis: Quantifying Feature Utilization in Deep Learning Models},
  author =       {Wong, Chun Kit and Pegios, Paraskevas and Weng, Nina and Sejer, Emilie Pi Fogtmann and Tolsgaard, Martin Gr{\o}nneb{\ae}k and Christensen, Anders Nymark and Feragen, Aasa},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2711--2737},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/wong26a/wong26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/wong26a.html},
  abstract = 	 {Deep learning models in medical imaging are susceptible to shortcut learning, relying on confounding metadata (e.g. scanner model) that is often encoded in image embeddings. The crucial question is whether the model actively utilizes this encoded information for its final prediction. We introduce Weight Space Correlation analysis, an interpretable methodology that quantifies feature utilization by measuring the alignment between the classification heads of a primary clinical task and auxiliary metadata tasks. We first validate our method by successfully detecting artificially induced shortcut learning. We then apply it to probe the feature utilization of an SA-SonoNet model trained for Spontaneous Preterm Birth (sPTB) prediction. Our analysis confirmed that while the embeddings contain substantial metadata, the sPTB classifier’s weight vectors were highly correlated with clinically relevant factors (e.g. cervical length) but decoupled from clinically irrelevant acquisition factors (e.g. scanner). Our methodology provides a tool for verifying model trustworthiness, by inspecting whether it utilizes features unrelated to the genuine clinical signal.}
}



@InProceedings{pmlr-v315-khan26a,
  title = 	 {Calibration-Aware Semi-Supervised Fetal Head Segmentation with Boundary-Positive Contrast},
  author =       {Khan, Ufaq and Nawaz, Umair and Ashraf, Tajamul and Saleem, Tausifa Jan and Caputo, Massimo and Narayan, Srinivas Ananth and Bilal, Muhammad and Qadir, Junaid and Haris, Muhammad},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2738--2756},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/khan26a/khan26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/khan26a.html},
  abstract = 	 {Accurate fetal head segmentation in ultrasound is hard to scale as labels are scarce and most errors occur at the head–background interface under speckle, shadowing, and low contrast. We present UltraSemiNet, a teacher–student framework that makes cross–pseudo supervision (CPS) selective via temperature calibration and a dual gate requiring high confidence and test-time augmentation (TTA) stability. We also introduce two boundary-focused modules that complement CPS: SAT, a boundary-positive spatial contrast that learns through ambiguous edges using an entropy belt and a soft-IoU agreement test; and PCM, a prototype-guided curriculum that maintains uncertainty-weighted head/background prototypes and targets feature–prototype discrepancies. Across two datasets (FBUI and HC18), UltraSemiNet improves overlap and boundary metrics over a calibrated CPS baseline (e.g., Dice $0.927{\rightarrow}0.971$; HD95 $7.9{\rightarrow}6.8$\,px), with similar cross-dataset trends. Crucially, the calibrated gate reduces miscalibration of the accepted pseudo-labels: both expected calibration error (ECE) and Brier score decrease overall, with the largest gains within the 0–2\,px boundary band, alongside improvements in pseudo-label accuracy. Ablations show CPS calibration, SAT, and PCM are complementary and concentrate improvements on boundary-sensitive metrics. In a blinded study, UltraSemiNet achieved better segmentation performance than two senior fetal medicine experts when evaluated against the dataset reference masks, indicating the potential to reduce manual refinements.}
}



@InProceedings{pmlr-v315-ozlugedik26a,
  title = 	 {MoA: Mixture of Aggregators Improves Slide-Level Diagnosis in Computational Pathology},
  author =       {Ozlugedik, Fatih and Dasdelen, Muhammed Furkan and Umer, Rao Muhammad and Marr, Carsten},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2757--2779},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/ozlugedik26a/ozlugedik26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/ozlugedik26a.html},
  abstract = 	 {Multiple instance learning (MIL) is the standard for learning slide-level representations from whole slide images (WSIs), typically using a single attention-based aggregator to pool instance features. However, a single aggregator can struggle to capture morphological and compositional patterns of cells in pathology and cytology data, and different diseases may demand different pooling behaviours. We propose a mixture-of-aggregators framework that models complementary aspects of instance distributions in histology and hematologic cytology. A router with top-2 gating dynamically selects the most relevant aggregators per slide, and their outputs are fused into a patient-level representation. To avoid collapse to a single dominant expert aggregator, we add a load-balancing loss and Gumbel noise on the router logits. We evaluate our method on 19 different tasks from 16 datasets including histology and hematologic cytology. Compared to single-aggregator baselines, our approach improves diagnostic prediction accuracy by an average of 4.5% over ABMIL and 12.6% over TransMIL across all tasks. Beyond performance, our analysis shows that different aggregators attend to distinct, disease-specific instance distributions, providing interpretable insights into the diagnostic process.}
}



@InProceedings{pmlr-v315-lin26b,
  title = 	 {Beyond Natural Images: A Dual-Stream DINOv3 Framework for PET/CT Segmentation},
  author =       {Lin, Yu-Nong Scarlett and Wang, Shansong and Safari, Mojtaba and Yang, Xiaofeng},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2780--2794},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/lin26b/lin26b.pdf},
  url = 	 {https://proceedings.mlr.press/v315/lin26b.html},
  abstract = 	 {Self-supervised vision transformers like DINOv3 are strong universal feature extractors, yet their transferability to functional medical imaging remains limited when pretrained on misaligned natural-image domains. In this work, we introduce Dual-DINOv3, a dual-stream framework for PET/CT that addresses two key gaps in existing work: the absence of a public, PET-specific pretrained encoder and the reliance on fully paired PET/CT data for multimodal pretraining. First, we present the first PET-specific DINOv3 encoder, pretrained exclusively on large-scale public FDG-PET datasets using the full three-stage DINOv3 self-distillation pipeline. Second, we propose a modality-separated PET/CT framework in which PET- and CT-specific encoders are pretrained independently and fused during finetuning via multiscale cross-attention, enabling multimodal representation learning without requiring paired data during pretraining. Evaluation on the HECKTOR tumor segmentation benchmark demonstrates three central findings: (1) misaligned natural-image pretraining degrades PET/CT performance relative to training from scratch, (2) domain-aligned CT pretraining substantially improves segmentation across all tumor sizes, and (3) dual-stream PET/CT pretraining achieves the best performance overall, highlighting the complementary contributions of functional and anatomical cues. Together, these results provide a fully public PET encoder and a scalable PET/CT foundation model that support domain-aligned representation learning under realistic clinical data constraints.}
}



@InProceedings{pmlr-v315-watzenbock26a,
  title = 	 {Chronological Contrastive Learning: Few-Shot Progression Assessment in Irreversible Diseases},
  author =       {Watzenb{\"o}ck, Clemens and Aletaha, Daniel and Deman, Micha{\"e}l and Deimel, Thomas and Eder, Jana and Jan\'{\i}\v{c}kov\'{a}, Ivana and Janiczek, Robert and Mandl, Peter and Seeb{\"o}ck, Philipp and Supp, Gabriela and Weiser, Paul and Langs, Georg},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2795--2817},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/watzenbock26a/watzenbock26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/watzenbock26a.html},
  abstract = 	 {Quantitative disease severity scoring in medical imaging is costly, time-consuming, and subject to inter-reader variability. At the same time, clinical archives contain far more longitudinal imaging data than expert-annotated severity scores. Existing self-supervised methods typically ignore this chronological structure. We introduce ChronoCon, a contrastive learning approach that replaces label-based ranking losses with rankings derived solely from the visitation order of a patient’s longitudinal scans. Under the clinically plausible assumption of monotonic progression in irreversible diseases, the method learns disease-relevant representations without using any expert labels. This generalizes the idea of Rank-N-Contrast from label distances to temporal ordering. Evaluated on rheumatoid arthritis radiographs for severity assessment, the learned representations substantially improve label efficiency. In low-label settings, ChronoCon significantly outperforms a fully supervised baseline initialized from ImageNet weights. In a few-shot learning experiment, fine-tuning ChronoCon on expert scores from only five patients yields an intraclass correlation coefficient of 86% for severity score prediction. These results demonstrate the potential of chronological contrastive learning to exploit routinely available imaging metadata to reduce annotation requirements in the irreversible disease domain.}
}



@InProceedings{pmlr-v315-cheng26a,
  title = 	 {Non-invasive estimation of haemodynamic parameters in pulmonary hypertension — A deep learning approach integrating all B-mode cine loops in an echocardiographic exam},
  author =       {Cheng, Li-Hsin and Alabed, Samer and Charalampopoulos, Athanasios and Goh, Ze Ming and Hameed, Abdul and Holman, Eduard and Kiely, David G. and Salehi, Mahan and Swift, Andrew J. and van der Geest, Rob J.},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2818--2833},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/cheng26a/cheng26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/cheng26a.html},
  abstract = 	 {Pulmonary hypertension (PH) is heterogeneous, with treatment strategy dependent on the underlying cause and disease severity. Haemodynamic parameters measured through right heart catheterization (RHC) are the gold standard for such diagnosis and decision making. However, the invasive procedure is associated with a certain level of risk and is not suitable for every patient. Therefore, we seek to investigate whether haemodynamic parameters can be estimated non-invasively using a deep learning approach. The study is based on a retrospective analysis of 833 subjects with suspected PH identified from the ASPIRE research database. Convolutional neural networks were built to integrate B-mode echocardiographic cine loops from multiple views to predict key haemodynamic parameters. The model was able to integrate an arbitrary number of cine loops from the entire exam, without view-name annotations. Additionally, attention weights in the feature fusion identify which cine loops are relevant or irrelevant to the model. The model-predicted mean pulmonary artery pressure (mPAP) correlated with the RHC ground truth with a Pearson Correlation Coefficient (PCC) of 0.70. Attention weights indicated the apical 4-chamber (A4C) view to be especially relevant for mPAP prediction. Our results demonstrate the feasibility of estimating haemodynamic parameters non-invasively through deep learning models, integrating all B-mode cine loops of a cardiac ultrasound exam, achieving a moderate correlation to RHC measurements.}
}



@InProceedings{pmlr-v315-posada26a,
  title = 	 {Semi-Synthetic Localization Datasets for Radiological Findings on Chest X-Rays},
  author =       {Posada, Andrea and Brandt, Johannes and Jungmann, Friederike and Posada, Maria and Rueckert, Daniel and Menten, Martin J. and Meissen, Felix and M\"uller, Philip},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2834--2863},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/posada26a/posada26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/posada26a.html},
  abstract = 	 {While large datasets for chest X-ray (CXR) finding classification are widely available, datasets for finding localization are scarce. Curating these localization datasets is costly and time-intensive, requiring manual annotation by medical experts, which often results in them being small and limited in scope. To overcome this, we introduce SemiSynCXR, a framework designed to automatically generate semi-synthetic localization datasets. SemiSynCXR operates by inpainting specific radiological findings into real, healthy CXRs at anatomically plausible locations, which allows for the output of both the edited image and the ground-truth bounding box for each finding. SemiSynCXR-generated CXRs effectively augment existing localization datasets, yielding relative mAP$_{10:70}$ gains of up to 11% on in-domain and 21% on out-of-domain data, thereby mitigating data scarcity and improving generalization. Comprehensive quantitative and qualitative evaluations show that our framework achieves an overall AUROC of 0.78 and mAP$_{10:70}$ of 0.45, comparable to fully synthetic benchmarks. These results confirm that the generated findings are realistic and accurately localized, establishing SemiSynCXR as a practical solution for the generation of CXR finding localization datasets.}
}



@InProceedings{pmlr-v315-vilouras26a,
  title = 	 {Anatomy-Grounded Weakly Supervised Prompt Tuning for Chest X-ray Latent Diffusion Models},
  author =       {Vilouras, Konstantinos and Stogiannidis, Ilias and Yan, Junyu and Smithard, Alison Q. and Tsaftaris, Sotirios A.},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2864--2892},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/vilouras26a/vilouras26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/vilouras26a.html},
  abstract = 	 {Latent Diffusion Models have shown remarkable results in text-guided image synthesis in recent years. In the domain of natural (RGB) images, recent works have shown that such models can be adapted to various vision-language downstream tasks with little to no supervision involved. On the contrary, text-to-image Latent Diffusion Models remain relatively underexplored in the field of medical imaging, primarily due to limited data availability (e.g., due to privacy concerns). In this work, focusing on the chest X-ray modality, we first demonstrate that a standard text-conditioned Latent Diffusion Model has not learned to align clinically relevant information in free-text radiology reports with the corresponding areas of the given scan. Then, to alleviate this issue, we propose a fine-tuning framework to improve multi-modal alignment in a pre-trained model such that it can be efficiently repurposed for downstream tasks such as phrase grounding. Our method sets a new state-of-the-art on a standard benchmark dataset (MS-CXR), while also exhibiting robust performance on out-of-distribution data (VinDr-CXR). We further validate our approach through a pilot qualitative study and an experiment on grounded disease classification.}
}



@InProceedings{pmlr-v315-le26b,
  title = 	 {ConStruct: Structural Distillation of Foundation Models for Prototype-Based Weakly Supervised Histopathology Segmentation},
  author =       {Le, Khang and Thach, Ha and Vu, Anh M. and Vo, Trang T. K. and Huynh, Han H. and Yang, David and Le, Minh H. N. and Nguyen, Thanh-Huy and Awasthi, Akash and Mohan, Chandra and Han, Zhu and Nguyen, Hien Van},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2893--2905},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/le26b/le26b.pdf},
  url = 	 {https://proceedings.mlr.press/v315/le26b.html},
  abstract = 	 {Weakly supervised semantic segmentation (WSSS) in histopathology relies heavily on classification backbones, yet these models often localize only the most discriminative regions and struggle to capture the full spatial extent of tissue structures. Vision–language models such as CONCH offer rich semantic alignment and morphology-aware representations, while modern segmentation backbones like SegFormer preserve fine-grained spatial cues. However, combining these complementary strengths remains challenging, especially under weak supervision and without dense annotations. We propose a prototype learning framework for WSSS in histopathological images that integrates morphology-aware representations from CONCH, multi-scale structural cues from SegFormer, and text-guided semantic alignment to produce prototypes that are simultaneously semantically discriminative and spatially coherent. To effectively leverage these heterogeneous sources, we introduce text-guided prototype initialization that incorporates pathology descriptions to generate more complete and semantically accurate pseudo-masks. A structural distillation mechanism transfers spatial knowledge from SegFormer to preserve fine-grained morphological patterns and local tissue boundaries during prototype learning. Our approach produces high-quality pseudo masks without pixel-level annotations, improves localization completeness, and enhances semantic consistency across tissue types. Experiments on BCSS-WSSS datasets demonstrate that our prototype learning framework outperforms existing WSSS methods while remaining computationally efficient through frozen foundation model backbones and lightweight trainable adapters.}
}



@InProceedings{pmlr-v315-soraki26a,
  title = 	 {CrossFusion: A Multi-Scale Cross-Attention Convolutional Fusion Model for Cancer Survival Prediction},
  author =       {Soraki, Rustin and Wang, Huayu and Liu, Sitong and Elmore, Joann G. and Shapiro, Linda},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2906--2921},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/soraki26a/soraki26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/soraki26a.html},
  abstract = 	 {Cancer survival prediction from whole slide images (WSIs) relies on capturing prognostic features spanning multiple magnifications, from global tissue architecture to fine-grained cellular morphology. However, current approaches typically face two main limitations: most frameworks focus heavily on single-scale analysis, thereby overlooking the hierarchical context of tissue; meanwhile, existing multi-scale methods often employ simplistic fusion mechanisms (e.g., direct concatenation) that fail to model effective cross-scale interactions. To address these challenges, we propose CrossFusion, a novel multi-scale architecture that introduces a convolutional fusion processor to perform rigorous scale–space integration. Evaluated on six TCGA cancer cohorts, CrossFusion achieves state-of-the-art C-index performance, consistently outperforming both strong single-scale and multi-scale baselines. Furthermore, leveraging domain-specific pathology feature extractors yields additional gains in prognostic accuracy compared to general-purpose backbones.}
}



@InProceedings{pmlr-v315-kumar26b,
  title = 	 {PIKACHU: Prototypical In-context Knowledge Adaptation for Clinical Heterogeneous Usage},
  author =       {Kumar, Amar and TehraniNasab, Zahra and Kaczmarek, Emily and Arbel, Tal},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2922--2940},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/kumar26b/kumar26b.pdf},
  url = 	 {https://proceedings.mlr.press/v315/kumar26b.html},
  abstract = 	 {Medical imaging systems increasingly rely on large vision language foundation models (VLFMs) trained on diverse biomedical corpora, yet these models remain difficult to adapt to new clinical tasks without costly fine-tuning and large annotated datasets. We present PIKACHU (Prototypical In-Context Knowledge Adaptation for Clinical Heterogeneous Usage), a lightweight and generalizable framework that enables rapid few-shot adaptation of frozen medical FMs using only a handful of labelled examples. Unlike prior approaches that modify backbone weights or introduce heavy attention-based adapters, PIKACHU performs all task adaptation directly in the FM feature space through in-context prototypical reasoning. Given a small support set, the framework constructs class prototypes by averaging normalized embeddings from a frozen VLFM image encoder and performs prediction on query images using temperature-scaled cosine similarity. Only a single temperature parameter is learned. We evaluate PIKACHU across three heterogeneous medical imaging datasets - dermatological images (ISIC), Optical Coherence Tomography (OCT), and Diabetic Retinopathy (DR), using established vision models (SigLIP, PubMedCLIP, DinoV2, and ViT) as backbones. The proposed in-context learning (ICL) strategy consistently outperforms the baseline (zero-shot) approaches across all datasets and architectures, achieving substantial improvements in both accuracy and AUC. Notably, with PubMedCLIP as the backbone, PIKACHU achieves 0.69 accuracy on the ISIC dataset, 0.72 on OCT, and 0.79 on DR, demonstrating robust generalization across diverse clinical imaging modalities. These results highlight the promise of feature-space in-context learning as an efficient and deployable paradigm for test-time adaptation of foundation models, without the need for extensive retraining.}
}



@InProceedings{pmlr-v315-zhuang26a,
  title = 	 {From Cross-Sectional CT to Dynamic Insights: Pseudotime-Based Modeling of Lung Nodule Progression},
  author =       {Zhuang, Luoting and Tran, Linh M. and Zhu, Yunzheng and Prosper, Ashley E. and Hsu, William},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2941--2957},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/zhuang26a/zhuang26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/zhuang26a.html},
  abstract = 	 {Early detection of lung cancer relies on a comprehensive understanding of the progression of pulmonary nodules. Existing longitudinal modeling approaches are constrained by the limited availability of longitudinal datasets and fail to capture inter-nodular relationships. In this study, we present one of the first applications of pseudotime inference, adapted from single-cell RNA sequencing studies, to reconstruct progression trajectories of nodules from cross-sectional CT images. We collected 13,626 nodule snapshots from two screening cohorts and reserved a longitudinal test set for evaluation. We compared a graph-based pseudotime method (diffusion pseudotime) and an unsupervised deep learning framework combining a variational autoencoder and a neural ordinary differential equation. Both approaches demonstrate longitudinal consistency, with malignant nodules showing a higher correlation between pseudotime and actual time. Pseudotime aligns with clinically relevant features such as irregular margins and solid consistency. Furthermore, pseudotime and delta-pseudotime effectively stratify nodules into distinct malignancy risk groups and remain significant independent predictors of malignancy after adjusting for established semantic biomarkers. Our study highlights pseudotime inference as a promising tool for dynamic modeling of lesion progression using static imaging data.}
}



@InProceedings{pmlr-v315-liu26d,
  title = 	 {A Tool Bottleneck Framework for Clinically-Informed and Interpretable Medical Image Understanding},
  author =       {Liu, Christina and Wang, Alan Q. and Hsu, Joy and Wu, Jiajun and Adeli, Ehsan},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2958--2986},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/liu26d/liu26d.pdf},
  url = 	 {https://proceedings.mlr.press/v315/liu26d.html},
  abstract = 	 {Recent tool-use frameworks powered by vision-language models (VLMs) improve image understanding by grounding model predictions with specialized tools. Broadly, these frameworks leverage VLMs and a pre-specified toolbox to decompose the prediction task into multiple tool calls (often deep learning models) which are composed to make a prediction. The dominant approach to composing tools is using text, via function calls embedded in VLM-generated code or natural language. However, these methods often perform poorly on medical image understanding, where salient information is encoded as spatially-localized features that are difficult to compose or fuse via text alone. To address this, we propose a tool-use framework for medical image understanding that composes VLM-selected tools using a learned Tool Bottleneck Model (TBM). For a given image and task, the framework leverages an off-the-shelf medical VLM to select tools from a toolbox that each extract clinically-relevant features. Instead of text-based composition, these tools are composed by the TBM, which computes and fuses the tool outputs using a neural network before outputting the final prediction. We propose a simple and effective strategy for TBMs to make predictions with any arbitrary VLM tool selection. Overall, our framework not only improves tool-use in medical imaging contexts, but also yields more interpretable, clinically-grounded predictors. We evaluate on tasks in histopathology and dermatology and find that these advantages enable our framework to perform on par with or better than deep learning-based classifiers, VLMs, and state-of-the-art tool-use frameworks, with particular gains in data-limited regimes.}
}



@InProceedings{pmlr-v315-bhandari26a,
  title = 	 {Automated Quality Assessment of Blind Sweep Obstetric Ultrasound for Improved Diagnosis},
  author =       {Bhandari, Prasiddha and Poudel, Kanchan and Luitel, Nishant and Acharya, Bishram and Ghimire, Angelina and Wellman, Tyler and Koepsell, Kilian and Regmi, Pradeep Raj and Khanal, Bishesh},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2987--2997},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/bhandari26a/bhandari26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/bhandari26a.html},
  abstract = 	 {Blind Sweep Obstetric Ultrasound (BSOU) enables scalable fetal imaging in low-resource settings by allowing minimally trained operators to acquire standardized sweep videos for automated Artificial Intelligence (AI) interpretation. However, the reliability of such AI systems depends critically on the quality of the acquired sweeps, and little is known about how deviations from the intended protocol affect downstream predictions. In this work, we present a systematic evaluation of BSOU quality and its impact on three key AI tasks: sweep-tag classification, fetal presentation classification, and placenta-location classification. We simulate plausible acquisition deviations, including reversed sweep direction, probe inversion, and incomplete sweeps, to quantify model robustness, and we develop automated quality-assessment models capable of detecting these perturbations. To approximate real-world deployment, we simulate a feedback loop in which flagged sweeps are “re-acquired”, showing that such correction improves downstream task performance. Our findings highlight the sensitivity of BSOU-based AI models to acquisition variability and demonstrate that automated quality assessment can play a central role in building reliable, scalable AI-assisted prenatal ultrasound workflows, particularly in low-resource environments.}
}



@InProceedings{pmlr-v315-buess26a,
  title = 	 {ALO: Addressing Class Imbalance in Radiology Report Generation through Anatomy-Level Oversampling},
  author =       {Buess, Lukas and Kurin, Robert and Bhandary Panambur, Adarsh and Arias-Vergara, Tomas and Maier, Andreas},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {2998--3017},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/buess26a/buess26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/buess26a.html},
  abstract = 	 {Radiology report generation aims to connect visual understanding with clinical language, yet most methods rely on free-text supervision, which is highly variable and difficult to evaluate. Clinical datasets are also dominated by normal findings, causing models to underreport abnormalities. While recent works focus on architectural advances, we show that structured supervision and balanced sampling can yield substantial gains in clinical performance. We convert free-text reports into structured anatomy-level representations and introduce Anatomy-Level Oversampling (ALO), a data-centered sampling strategy that balances normal and abnormal findings for each anatomical region. This structure provides consistent supervision and enables more informative evaluation. Across three public datasets, ALO improves sensitivity to pathological findings while remaining fully model-agnostic. On internal validation, ALO increases F1-Score by 50% and CRG by 5.8%, and on external validation, it increases F1-Score by 45.1% and CRG by 5%. These results highlight the importance of structured data and balanced sampling for reliable report generation.}
}



@InProceedings{pmlr-v315-hashisho26a,
  title = 	 {Enforcing 3D Coherence in Semi-Supervised Segmentation for Pancreatic Tumor Histopathology from Light Sheet Fluorescence Microscopy},
  author =       {Hashisho, Yousif and Pinkert-Leetsch, Diana and Missbach-Guentner, Jeannine},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3018--3035},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/hashisho26a/hashisho26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/hashisho26a.html},
  abstract = 	 {Light sheet fluorescence microscopy (LSFM) provides unprecedented two-dimensional (2D) tomographic views and three-dimensional (3D) reconstructions of tissue volumes, but generates such large data sets that complete annotation is not feasible. This results in volumes with sparse axial annotations, where ground truth is available for only a small fraction of slices. Standard semi-supervised learning (SSL) methods often fail in this regime, unable to bridge the large gaps between labeled slices to produce coherent 3D segmentations. To address this, we propose a novel SSL framework designed to enforce 3D anatomical plausibility from sparse 2D supervision. The core of our contribution is an axial continuity loss, a regularization term that enforces prediction consistency between adjacent unlabeled slices. This loss is integrated into a voxel-aware Mean-Teacher framework that effectively leverages abundant unlabeled data. We validate our approach on a 3D LSFM dataset of human pancreatic ductal adenocarcinoma (PDAC), which we collected and sparsely annotated for this study. Our experiments show that standard SSL baselines degrade in performance as annotations become sparser, producing noisy predictions between labeled slices. In contrast, our full framework, which integrates an attention-gated 3D U-Net with our proposed continuity loss, maintains robust 3D coherence even in low-data regimes, enabling reliable histopathological analysis from minimal annotations.}
}



@InProceedings{pmlr-v315-butsanets26a,
  title = 	 {RadImageNet-VQA: A Large-Scale CT and MRI Dataset for Radiologic Visual Question Answering},
  author =       {Butsanets, L{\'e}o and Corbi{\`e}re, Charles and Khlaut, Julien and Manceron, Pierre and Dancette, Corentin},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3036--3068},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/butsanets26a/butsanets26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/butsanets26a.html},
  abstract = 	 {In this work, we introduce RadImageNet-VQA, a large-scale dataset designed to advance radiologic visual question answering (VQA) on CT and MRI exams. While existing medical VQA datasets are limited in scale, dominated by X-ray imaging or biomedical illustrations, and prone to text-based shortcuts, RadImageNet-VQA is built from expert-curated annotations and provides 750K images paired with 7.5M QA samples. It covers three key tasks—abnormality detection, anatomy recognition, and pathology identification—spanning 8 anatomical regions and 97 pathology categories, and supports open-ended, closed-ended, and multiple-choice questions. Extensive experiments show that state-of-the-art vision-language models still struggle with fine-grained pathology identification, especially in open-ended settings and even after fine-tuning. Text-only analysis further reveals that model accuracies collapse to near-random without image inputs, confirming that RadImageNet-VQA is free from linguistic shortcuts.}
}



@InProceedings{pmlr-v315-le-gia26a,
  title = 	 {Training-Free Zero-Shot Anomaly Detection in 3D Brain MRI with 2D Foundation Models},
  author =       {Le Gia, Tai and Ahn, Jaehyun},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3069--3088},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/le-gia26a/le-gia26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/le-gia26a.html},
  abstract = 	 {Zero-shot anomaly detection (ZSAD) has gained increasing attention in medical imaging as a way to identify abnormalities without task-specific supervision, but most advances remain limited to 2D datasets. Extending ZSAD to 3D medical images has proven challenging, with existing methods relying on slice-wise features and vision–language models, which fail to capture volumetric structure. In this paper, we introduce a fully training-free framework for ZSAD in 3D brain MRI that constructs localized volumetric tokens by aggregating multi-axis slices processed by 2D foundation models. These 3D patch tokens restore cubic spatial context and integrate directly with distance-based, batch-level anomaly detection pipelines. The framework provides compact 3D representations that are practical to compute on standard GPUs and require no fine-tuning, prompts, or supervision. Our results show that training-free, batch-based ZSAD can be effectively extended from 2D encoders to full 3D MRI volumes, offering a simple and robust approach for volumetric anomaly detection.}
}



@InProceedings{pmlr-v315-gebauer26a,
  title = 	 {Robust Multi-Scale Implicit Neural Representations for Large-Deformation Lung Registration},
  author =       {Gebauer, Johannes B. and Nielsen, Maximilian and Madesta, Frederic and Werner, Ren{\'e} and Sentker, Thilo},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3089--3102},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/gebauer26a/gebauer26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/gebauer26a.html},
  abstract = 	 {We propose a multi-scale Implicit Neural Representation (INR) framework for dense deformable image registration, designed to stabilize convergence for large deformations while preserving precision for fine anatomical details. We model the INR as a dual-branch architecture that explicitly decomposes the motion into global and local components. The objective function is driven by mask-guided Normalized Cross-Correlation augmented by geometric and semantic regularization to ensure smooth, anatomically plausible motion. Evaluation on the DIR-Lab 4DCT thorax dataset demonstrates competitive performance with a mean Target Registration Error (TRE) below 1.0\,mm. On the more challenging DIR-Lab COPDgene thorax dataset, the model achieves robust alignment with a mean TRE of 1.23\,mm, yielding performance comparable to leading classical optimization frameworks. A comprehensive ablation study confirms that the dual-branch design and multi-scale optimization strategy are necessary to achieve these results, enabling stable registration with modest computational overhead.}
}



@InProceedings{pmlr-v315-anglada-rotger26a,
  title = 	 {Evidential DualU-Net: Single-Pass Uncertainty for Cell Instance Segmentation},
  author =       {Anglada-Rotger, David and Marques, Ferran and Pard{\`a}s, Montse},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3103--3130},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/anglada-rotger26a/anglada-rotger26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/anglada-rotger26a.html},
  abstract = 	 {Accurate and trustworthy cell instance segmentation requires models that not only detect and classify nuclei but also communicate how much evidence supports each prediction. DualU-Net is a fast and effective two-head multi-task architecture for this problem, but—like most deterministic models—it provides no principled uncertainty estimates. We introduce Evidential DualU-Net, the first evidential framework for multi-task cell instance segmentation. Its segmentation head predicts Dirichlet concentration parameters, enabling single-pass, closed-form aleatoric, epistemic, and vacuity uncertainties at the pixel level, with instance-level quantities obtained via size-invariant pooling of pixel evidence. The centroid decoder is complemented with two lightweight geometric uncertainty cues that quantify localisation reliability without auxiliary models or sampling. Together, these evidential and geometric measures expose complementary failure modes and allow principled filtering of low-confidence nuclei. Across multi-tissue and multi-stain datasets, Evidential DualU-Net matches or surpasses deep ensembles and MC Dropout in error separation at a fraction of the cost, maintains or improves calibration over deterministic baselines, and generalises across datasets without retuning. This work provides an interpretable and computationally practical uncertainty formulation for digital pathology.}
}



@InProceedings{pmlr-v315-kshirsagar26a,
  title = 	 {Learning Structure-Aware Foundational Representation of Rat Testicular Tubules Using Multiple Instance Learning},
  author =       {Kshirsagar, Vedang and Juturu, Saketh and Raipuria, Geetank and Singhal, Nitin},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3131--3151},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/kshirsagar26a/kshirsagar26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/kshirsagar26a.html},
  abstract = 	 {Testicular toxicity is a critical factor in preclinical drug safety assessment, yet automated modelling of testicular abnormalities remains largely unexplored. Unlike liver or kidney tissue, testicular tissue is organized into tubules that vary substantially in size and structure, making fixed-resolution patch classification ineffective. We first demonstrate that resizing tubules significantly degrades performance, particularly for larger tubules, and that a Multiple Instance Learning (MIL) model offers substantial improvements. Building on this, we introduce TBA-MIL, a transformer-based aggregation model with learnable positional embeddings that encodes the structure of tubules and is pre-trained using a self-supervised Masked Instance Modelling (MIM-MIL) framework, learning tubule representations from large-scale unlabeled data. Across four tubule types, TBA-MIL with MIM-MIL outperforms state-of-the-art MIL models and establishes a strong baseline for automated testicular toxicity assessment. Additionally, we evaluate the proposed framework on an independent toxicological study and show that the predicted abnormality distributions significantly differentiate control and treated animal tissues, consistent with expert pathologists’ assessment.}
}



@InProceedings{pmlr-v315-saboo26a,
  title = 	 {Anatomical Longitudinal Cortical Surface Registration},
  author =       {Saboo, Aakash and Davies, Ashleigh and Baena, Nashira and Liang, Kaili and Xiao, Jiaxin and Guo, Yourong and Basenczi, Renato and O'Muircheartaigh, Jonathan and Robinson, Emma},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3152--3173},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/saboo26a/saboo26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/saboo26a.html},
  abstract = 	 {Longitudinal cortical surface registration is essential for accurately characterizing developmental and neurodegenerative trajectories, thereby facilitating a mechanistic understanding of cortical growth and the identification of biomarkers. This is hindered by current registration networks, which work on spherical projections of the cortical surface. In this work, we therefore present a novel longitudinal registration framework that operates directly on complex anatomical geometries by integrating a learning-based network with pairwise instance optimization. This hybrid strategy leverages the network to establish a robust initial alignment, which is subsequently refined through optimization to ensure high-fidelity registration. We demonstrate that this method yields growth maps with superior smoothness compared to baselines, enhancing their clinical utility, while rigorously preserving topological integrity as evidenced by analyses of self-intersecting faces, areal distortion, and anisotropic strain.}
}



@InProceedings{pmlr-v315-dhor26a,
  title = 	 {TUNE++: Topology-Guided Uncertainty Estimation for Reliable 3D Medical Image Segmentation},
  author =       {Dhor, Ashim and Banerjee, Abhirup and Basu, Tanmay},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3174--3207},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/dhor26a/dhor26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/dhor26a.html},
  abstract = 	 {Deep learning models for medical image segmentation lack mechanisms to assess their own reliability, leading to two critical failures: they provide no uncertainty estimates to distinguish confident predictions from error-prone ones, and often produce anatomically implausible segmentations or incorrect connectivity that violate known structural constraints. We observe that uncertainty and topology are intrinsically linked: anatomically complex regions naturally exhibit higher prediction uncertainty, while uncertain predictions require stronger enforcement of structural constraints. Building on this insight, we propose TUNE++, a unified framework that jointly learns segmentation, uncertainty quantification, and topology preservation through a novel Topology-Uncertainty aware Paired Attention (TUPA) mechanism. Our method decomposes uncertainty into aleatoric and epistemic components while simultaneously enforcing anatomical correctness through persistent homology-based constraints. A key innovation is our topology-uncertainty alignment loss that minimizes the discrepancy between predicted total uncertainty and a topological complexity score computed from organ boundaries, multi-organ junction counts, and critical points extracted from persistence diagrams, teaching the model to be uncertain precisely where anatomical structure is geometrically complex. Our empirical results demonstrate that the joint modeling of TUNE++ produces enhanced segmentation accuracy, well-calibrated uncertainty estimates that successfully identify errors, substantial reduction in topological violations, and learned confidence that correlates strongly with anatomical complexity.}
}



@InProceedings{pmlr-v315-irshad26a,
  title = 	 {No Evidence of Disease: Clinically-Risky Adversarial Chest CT Report Generation},
  author =       {Irshad, Samra and Kim, Junho and Kim, Seong Tae},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3208--3229},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/irshad26a/irshad26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/irshad26a.html},
  abstract = 	 {Automated chest CT radiology report generation has equipped clinicians with the ability to automatically describe clinical findings and abnormalities from CT scans. Given that patient prognosis relies heavily on these reports, generating an accurate CT report is critical. Advances in Multimodal Large Language Models (MLLMs) have enabled substantial improvements in CT-to-text report generation models, yet recent studies show that MLLMs are highly susceptible to adversarial perturbations. Beyond this known susceptibility, it remains unclear what triggers clinically dangerous attack scenarios during medical report generation. Understanding such threats is essential for developing robust medical AI systems; without a clear characterization of the threat, it is challenging to mitigate real-world risks. In this paper, we investigate how chest CT report generation models can be adversarially manipulated and what constitutes an adversarial CT report. We introduce Clinically Risky Adversarial Report Generation (CRA-RG), a threat model that defines clinically realistic adversarial alterations to chest CT reports. To instantiate this threat model, we develop a targeted multimodal attack that perturbs both CT volumes and conditioning text prompts to induce clinically risky changes in reports. We show that our attack can successfully omit and fabricate clinically grounded high-risk chest CT findings (e.g., nodules or lesions). To the best of our knowledge, our study is the first empirical demonstration that state-of-the-art CT report generation models can be deceived into producing harmful clinical decisions, potentially leading to missed diagnoses or unnecessary biopsies. We evaluate our attack on two state-of-the-art CT report generation models using the publicly available chest 3D CT RadGenome dataset.}
}



@InProceedings{pmlr-v315-lawrence26a,
  title = 	 {Geometry-Aware Depth-Guided Explainable Multimodal Polyp Size Estimation: A Fusion Model Beyond RGB},
  author =       {Lawrence, Krispian and Goparaju, Usha and Lamb, Luis},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3230--3244},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/lawrence26a/lawrence26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/lawrence26a.html},
  abstract = 	 {Accurately estimating the physical size of colorectal polyps from monocular endoscopy is difficult due to scale ambiguity, viewpoint distortions, and strong inter-patient variability. We introduce MPSE, a geometry-aware, depth-guided multimodal framework that jointly leverages RGB appearance, monocular depth cues, and interpretable geometry descriptors to produce reliable and clinically calibrated size estimates. Central to MPSE is a geometry-as-query fusion block that selectively attends to depth and RGB features, and a Scale Consistency Block (SCB) that models agreement between 2D footprint–derived and 3D depth–derived cues, reducing size bias under severe distribution imbalance. The model is trained with a primary regression objective supported by an auxiliary threshold-based classification loss that stabilizes predictions near clinically important cutoffs. On our clinical dataset, MPSE achieves a mean absolute error of 0.93\,mm and a polyp-level F1 score of 0.87 at the clinically critical 5\,mm threshold, demonstrating accurate and clinically reliable size estimation in endoscopy.}
}



@InProceedings{pmlr-v315-kartika26a,
  title = 	 {Efficient Self-Supervised Adaptation of 3D Abdominal Vision-Language Model for Institution-Specific HCC Classification via Full Fine-Tuning and PEFT},
  author =       {Kartika, Febryan Putra and Ma, Cheng-Yu and Lin, Ying-Jia and Cheng, Chi-Tung and Chen, Kuan-Fu},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3245--3269},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/kartika26a/kartika26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/kartika26a.html},
  abstract = 	 {Medical vision-language models (VLMs) have demonstrated a strong capability in capturing cross-modal relationships between image and text, yet their adaptation to institution-specific clinical tasks remains underexplored. In this study, we fine-tuned a pretrained 3D medical VLM for hepatocellular carcinoma (HCC) classification using paired abdominal CT scans and radiology reports from a different institution and with acquisition characteristics that differ from the model’s original pretraining corpus. We compared two adaptation strategies: full fine-tuning and parameter-efficient fine-tuning (PEFT), motivated by the common use of PEFT to reduce computational cost and enable adaptation under limited-data constraints. Both approaches achieve strong downstream HCC classification performance despite the cross-institutional domain shift, with PEFT reaching an AUC of 0.94 and F1 of 0.91, and full fine-tuning achieving an AUC of 0.95 and F1 of 0.90. These results are competitive with, and in some settings exceed, previously reported supervised HCC classification approaches that rely on lesion-level annotation or segmentation. Full fine-tuning converges rapidly but overfits within a few epochs, whereas PEFT (ConvLoRA for the image encoder and LoRA for the text encoder) attains comparable performance while updating only $\sim$1\% of the model parameters, although requiring more training steps. To better understand adaptation behavior, we also examine the role of contrastive temperature, observing that temperature initialization significantly affects classification performance. This study demonstrates that a 3D medical VLM can be efficiently adapted to institution-specific HCC classification using self-supervised CT-report contrastive learning, while highlighting the practical trade-offs between full fine-tuning and parameter-efficient fine-tuning.}
}



@InProceedings{pmlr-v315-grove26a,
  title = 	 {More is more: leveraging multi-rater information for whole slide images grading via virtual expert panel},
  author =       {Grove, Jan and Botros, Michel and Weeda, Ylva A and S{\'a}nchez, Clara I. and Bekkers, Erik and Meijer, Sybren L and Kervadec, Hoel},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3270--3282},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/grove26a/grove26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/grove26a.html},
  abstract = 	 {In medical imaging, datasets with several expert diagnoses capture diagnostic uncertainty, yet many approaches compress diagnoses into a single consensus label. Due to their highly subjective nature, Barrett’s esophagus gradings often diverge, thus necessitating several expert opinions to mitigate variation in diagnostic or treatment outcomes. Using a multi-rater dataset from the Dutch Esophageal Pathology Panel, we propose an approach to tackle the implied issues such as poor calibration and overconfident predictions that come with a compressed label. We offer an approach that models individual rater behaviors as part of virtual panels, allowing for better prediction performance while also improving the quality of uncertainty estimates for clinical decision-making when compared to pre-compressed labels. We show that due to their individual correlation with the clinical consensus, a combination of raters—especially an inclusion of all raters—yields higher performance and better calibrated predictions.}
}



@InProceedings{pmlr-v315-monnin26a,
  title = 	 {Explainable HCC Diagnosis on Dynamic Contrast-Enhanced MRI with a Li-RADS Concept Bottleneck},
  author =       {Monnin, Killian and Jeltsch, Patrick and Fernandes-Mendes, Lucia and Cazzagon, Vasco and Y{\"u}ce, Murat and Yadav, Vivek and Jreige, Mario and Gulizia, Marianna and Fraga Christinet, Montserrat and Girardet, Rapha{\"e}l and Dromain, Clarisse and Taouli, Bachir and Vietti-Violi, Na{\"i}k and Richiardi, Jonas},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3283--3313},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/monnin26a/monnin26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/monnin26a.html},
  abstract = 	 {We propose an explainable end-to-end framework for hepatocellular carcinoma (HCC) diagnosis on dynamic contrast-enhanced (DCE) liver MRI. Our method embeds Liver Imaging Reporting and Data System (Li-RADS)–inspired concepts into the network via a multi-head concept bottleneck. A 2.5D EfficientNet backbone processes lesion-centred multiphase MRI crops, and a 4-head architecture jointly predicts continuous soft labels for non-rim arterial phase hyperenhancement (APHE), portal venous/delayed washout and capsule, lesion morphology, and a LR-5 score (definite HCC vs non-HCC) based on the Li-RADS guidelines. Soft labels are derived automatically from intra-lesional, peri-lesional and parenchymal intensity patterns, and the network is trained with uncertainty-weighted losses to balance concept prediction, contrast regression and HCC classification. On our cohort, the Li-RADS–inspired bottleneck substantially improves NormGrad explanation accuracy, geometric stability and intensity robustness while maintaining PR AUC comparable to a single-head baseline, highlighting an interpretable alternative to a black-box HCC classifier.}
}



@InProceedings{pmlr-v315-moonemans26a,
  title = 	 {Democratising Pathology Co-Pilots: An Open Pipeline and Dataset for Whole-Slide Vision-Language Modelling},
  author =       {Moonemans, Sander and Ram, Sebastiaan and Meeuwsen, Fr{\'e}d{\'e}rique and Lems, Carlijn and van der Laak, Jeroen and Litjens, Geert and Ciompi, Francesco},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3314--3335},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/moonemans26a/moonemans26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/moonemans26a.html},
  abstract = 	 {Vision-language models (VLMs) have the potential to become co-pilots for pathologists. However, most VLMs either focus on small regions of interest within whole-slide images, provide only static slide-level outputs, or rely on data that is not publicly available, limiting reproducibility. Furthermore, training data containing WSIs paired with detailed clinical reports is scarce, restricting progress toward transparent and generalisable VLMs. We address these limitations with three main contributions. First, we introduce Polysome, a standardised tool for synthetic instruction generation. Second, we apply Polysome to the public HISTAI dataset, generating HISTAI-Instruct, a large whole-slide instruction tuning dataset spanning 24,259 slides and over 1.1 million instruction-response pairs. Finally, we use HISTAI-Instruct to train ANTONI-$\alpha$, a VLM capable of visual-question answering (VQA). We show that ANTONI-$\alpha$ outperforms MedGemma on WSI-level VQA tasks of tissue identification, neoplasm detection, and differential diagnosis. We also compare the performance of multiple incarnations of ANTONI-$\alpha$ trained with different amounts of data. All methods, data, and code are publicly available.}
}



@InProceedings{pmlr-v315-chraki26a,
  title = 	 {Counterfactual Intervention in Attention Multiple Instance Learning For Digital Pathology},
  author =       {Chraki, Imane and Marza, Pierre and Christodoulidis, Stergios and Vakalopoulou, Maria},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3336--3354},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/chraki26a/chraki26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/chraki26a.html},
  abstract = 	 {Attention-based Multiple Instance Learning (MIL) has become a prominent framework for analysing whole-slide images (WSI). These models have been shown to achieve good performance on classification tasks, while also offering an inherent proxy for interpretability through attention weights. In this work, we first question the validity of using attention for the interpretability of MIL models. Subsequently, we propose Counterfactual Intervention in Attention for MIL, a causal extension of attention-based MIL that explicitly measures and optimizes the contribution of attention to slide-level predictions. Across four histopathology classification benchmarks (BRCA, NSCLC, LUAD, Camelyon16) and two feature encoders (ResNet50, UNI), we investigate how the interpretability of attention relates to the representation space, and the downstream performance. We then show that our approach achieves performance comparable to strong MIL baselines while providing a more causally meaningful attention vector for explaining the model’s outcome. Qualitative perturbation experiments show that dropping the top-attended patches leads to a larger confidence degradation in our model than in baseline ABMIL, highlighting the potential of causal supervision for reliable and interpretable WSI-based prediction.}
}



@InProceedings{pmlr-v315-bhandary26a,
  title = 	 {Learning Robust Medical Image Segmentation with Inductive Bias},
  author =       {Bhandary, Shrajan and Kuhn, Dejan and Babaiee, Zahra and Fechter, Tobias and Grosu, Anca{-}Ligia and Grosu, Radu},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3355--3373},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/bhandary26a/bhandary26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/bhandary26a.html},
  abstract = 	 {Despite the success of transformer-based and convolutional neural networks in 3D medical image segmentation, current architectures exhibit limited generalisation on small datasets and under distribution shifts, especially when high-quality examples are scarce for specific structures. We introduce IB-nnU-Nets, a family of U-Net variants augmented with inductively biased filters inspired by vertebrate visual processing. Starting from a 3D U-Net backbone, we insert two 3D residual components into the second encoder block that implement on- and off-centre-surround convolutions with fixed, pre-computed weights and act as complementary edge detectors. Across multiple organ and tumour segmentation tasks, we show that equipping state-of-the-art 3D U-Nets with an IB block improves accuracy and robustness, with the strongest gains in small-data and out-of-distribution settings. The framework and trained IB-nnU-Net models are publicly available.}
}



@InProceedings{pmlr-v315-grigorescu26a,
  title = 	 {SuD-CoTAN: Sulcal Depth-guided Anatomically Consistent Fetal Cortical Surface Reconstruction},
  author =       {Grigorescu, Irina and Xiao, Jiaxin and Guo, Yourong and Kyriakopoulou, Vanessa and Uus, Alena and Karolis, Vyacheslav and Liang, Kaili and Suliman, Mohamed A. and Ma, Qiang and Rueckert, Daniel and Kainz, Bernhard and Edwards, A. David and Hajnal, Joseph V. and Rutherford, Mary and Deprez, Maria and Robinson, Emma C.},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3374--3396},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/grigorescu26a/grigorescu26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/grigorescu26a.html},
  abstract = 	 {Accurate and anatomically consistent fetal cortical surface reconstruction is essential for studying early brain development, yet existing methods often lack reliable vertex-wise correspondence and fail to harmonise their outputs across heterogeneous magnetic resonance imaging (MRI) datasets. We introduce Sulcal Depth-guided CoTAN (SuD-CoTAN), a learning-based framework that fits anatomically and topologically consistent cortical meshes directly to T2-weighted MRI and performs alignment to age-matched templates in one single step. All models are trained exclusively on normative samples from the developing Human Connectome Project (dHCP) and evaluated within-sample and on a different acquisition protocol. Results show that SuD-CoTAN generalises to new datasets in ways that harmonise global morphometric properties by better capturing the surface geometry of individual cases; its template fitting is precise, delivering vertex-wise anatomical correspondences that result in sharp weekly averages of sulcal depth and curvature maps in template space. This supports direct vertex-wise Gaussian Process regression of neurodevelopmental trends without a need for any additional registration. Collectively, this whole pipeline runs in $\sim$3 seconds. This suggests that SuD-CoTAN offers promise as a screening tool for cortical malformations during fetal development.}
}



@InProceedings{pmlr-v315-lux26a,
  title = 	 {Beyond scalar losses: calibrating segmentation models via gradient vector field surgery},
  author =       {Lux, Laurin and Berger, Alexander H. and Knolle, Moritz and R{\"u}ckert, Daniel and Paetzold, Johannes C.},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3397--3423},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/lux26a/lux26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/lux26a.html},
  abstract = 	 {Region-based loss functions, such as the Dice loss, have established themselves as the de facto standard for highly class- and region-imbalanced segmentation tasks. However, models trained using region-based loss functions are notoriously miscalibrated and typically yield over-confident predictions. In medical imaging applications, such as defining tumor resection margins, this miscalibration is hindering clinical adoption. In this work, we outline a novel gradient perspective on this overconfidence and show how it affects region-based loss functions. We propose a "surgery" on the gradient vector field as a simple, yet effective intervention to mitigate calibration issues. This surgery adds a factor to the loss’s partial derivative, scaling the gradient’s magnitude linearly with the prediction error. In empirical evaluations across 2D and 3D medical segmentation tasks, we demonstrate the effectiveness of this intervention while maintaining high prediction accuracy when used in conjunction with any region-based loss function.}
}



@InProceedings{pmlr-v315-masui26a,
  title = 	 {CSVR: Combined Surface and Volume Registration for Neonatal Brain MRI},
  author =       {Masui, Saga N.B. and Guo, Yourong and Suliman, Mohamed A. and Heinrich, Mattias P. and Baena, Nashira and Grigorescu, Irina and Williams, Logan Z. J. and Davies, Ashleigh and Kyriakopoulou, Vanessa and McAlonan, Gr{\'a}inne and O'Muircheartaigh, Jonathan and Robinson, Emma C.},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3424--3442},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/masui26a/masui26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/masui26a.html},
  abstract = 	 {Nonlinear image registration is a cornerstone of neuroimaging analysis, supporting both qualitative and quantitative comparisons of brain structures across individuals and over time. While traditional volumetric registration methods, driven by voxel intensities, achieve good alignment of subcortical regions, they generally fail to capture correspondences between highly convoluted and variable cortical shapes. Surface-based methods, which instead regularise mappings as geodesics along the cortical sheet, yield improved cortical alignment but ignore the subcortical domain, limiting their utility for whole-brain analyses. A unified registration framework would address these limitations to enable integrated analysis of cortical and subcortical structures and the neuronal fibres that connect them. However, achieving this is challenging, since matching heterogeneous cortical shapes implies large volumetric displacements local to the cortex. To overcome these challenges, we introduce CSVR, the first deep learning-based framework for combined surface–volume registration of neonatal MRI. By integrating hierarchical registration strategies with discrete optimisation, CSVR achieves accurate, smooth, and anatomically plausible alignment of the entire brain.}
}



@InProceedings{pmlr-v315-sens26a,
  title = 	 {GEMCONT: Genetics-based Multimodal Contrastive Learning Enhances Phenotypic embeddings and Boosts Genetic Discovery},
  author =       {Sens, Daniel and Shilova, Liubov and Dalca, Adrian V. and Schnabel, Julia A. and Casale, Francesco Paolo},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3443--3463},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/sens26a/sens26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/sens26a.html},
  abstract = 	 {Genetic variation provides stable, time-invariant markers of disease risk and can therefore reveal upstream mechanisms underlying complex traits. Genome-wide association studies (GWAS) have identified thousands of loci associated with disease, yet most remain difficult to interpret because the intermediate phenotypes linking genotype to disease are unknown. Here, we address the question whether disease-associated genetic loci can be directly used to extract such risk-related features from quantitative phenotypes, including functional tests and medical imaging. We introduce GEMCONT (GEnetics-based Multimodal CONTrastive Learning), a multimodal contrastive learning framework that aligns genotype and phenotype representations in a shared latent space. Unlike task-agnostic multimodal pretraining, GEMCONT is disease-conditioned: GWAS-informed variant panels act as targeted supervision to learn risk-relevant imaging embeddings. To reflect the weak, additive nature of genetic effects, it employs a linear genetic encoder alongside a deep phenotypic encoder. We validate GEMCONT in controlled simulations and apply it to two real-world settings: spirometry curves for asthma and retinal fundus images for glaucoma. In both, GEMCONT improves disease risk prediction and enhances recovery of genetic associations compared with standard unsupervised or polygenic risk–based models. Altogether, our results demonstrate that incorporating stable genetic supervision into multimodal representation learning enables the extraction of genetically informed risk traits, refining disease phenotypes and improving the interpretability of association studies.}
}



@InProceedings{pmlr-v315-archit26a,
  title = 	 {Revisiting foundation models for cell instance segmentation},
  author =       {Archit, Anwai and Pape, Constantin},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3464--3495},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/archit26a/archit26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/archit26a.html},
  abstract = 	 {Cell segmentation is a fundamental task in microscopy image analysis. Several foundation models for cell segmentation have been introduced; virtually all of them are extensions of the Segment Anything Model (SAM), improving it for microscopy data. Recently, SAM2 and SAM3 have been published, further improving and extending the capabilities of general-purpose segmentation foundation models. Here, we comprehensively evaluate foundation models for cell segmentation (CellPoseSAM, CellSAM, $\mu$SAM) and for general-purpose segmentation (SAM, SAM2, SAM3) on a diverse set of (light) microscopy datasets, for tasks including cell, nucleus and organoid segmentation. Furthermore, we introduce a new instance segmentation strategy called automatic prompt generation (APG) that can be used to further improve SAM-based microscopy foundation models. APG consistently improves segmentation results for $\mu$SAM, which is used as the base model, and is competitive with the state-of-the-art model CellPoseSAM. Moreover, our work provides important lessons for adaptation strategies of SAM-style models to microscopy and offers a strategy for creating even more powerful microscopy foundation models.}
}



@InProceedings{pmlr-v315-zhang26b,
  title = 	 {RadAgents: Multimodal Agentic Reasoning for Chest X-ray Interpretation with Radiologist-like Workflows},
  author =       {Zhang, Kai and Barrett, Corey D and Kim, Jangwon and Sun, Lichao and Taghavi, Tara and Kenthapadi, Krishnaram},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3496--3519},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/zhang26b/zhang26b.pdf},
  url = 	 {https://proceedings.mlr.press/v315/zhang26b.html},
  abstract = 	 {Agentic systems offer a potential path to solve complex clinical tasks through collaboration among specialized agents, augmented by tool use and external knowledge bases. Nevertheless, for chest X-ray (CXR) interpretation, prevailing methods remain limited: (i) reasoning is frequently neither clinically interpretable nor aligned with guidelines, reflecting mere aggregation of tool outputs; (ii) multimodal evidence is insufficiently fused, yielding text-only rationales that are not visually grounded; and (iii) systems rarely detect or resolve cross-tool inconsistencies and provide no principled verification mechanisms. To bridge the above gaps, we present RadAgents, a multi-agent framework that couples clinical priors with task-aware multimodal reasoning and encodes a radiologist-style workflow into a modular, auditable pipeline. In addition, we integrate grounding and multimodal retrieval-augmentation to verify and resolve context conflicts, resulting in outputs that are more reliable, transparent, and consistent with clinical practice.}
}



@InProceedings{pmlr-v315-nahian26a,
  title = 	 {Domain-Constrained Distillation of DINOv3 into a Lightweight Foundation Model Toward Point-of-Care Ultrasound},
  author =       {Nahian, Md Jaber Al and Ghosh, Shrimanti and Jaremko, Jacob and Hareendranathan, Abhilash},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3520--3541},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/nahian26a/nahian26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/nahian26a.html},
  abstract = 	 {Vision foundation models such as DINOv3 provide powerful representations but are too computationally demanding for point-of-care ultrasound (POCUS), whereas lightweight CNNs remain deployable yet brittle when faced with diverse anatomies and acquisition styles. We bridge this gap with a domain-constrained distillation framework that transfers DINOv3 ViT-B/16 knowledge into a compact ResNet-50, achieving roughly 3.4$\times$ compression while preserving the teacher’s billion-scale visual priors. Using a large, heterogeneous ultrasound corpus and physics-aware augmentations, the distilled model delivers substantial linear-probe improvements over standard CNN baselines and consistently outperforms the ViT teacher on challenging, heterogeneous datasets. It further offers marked gains in limited-label regimes, reflecting the realities of POCUS workflows where annotated data are scarce. Embedding visualizations show that the distilled encoder forms clearer, anatomy-aware clusters than the teacher, indicating successful alignment to ultrasound structure. Together, these results demonstrate that large-scale natural-image priors can be distilled into a lightweight, generalizable encoder suitable for resource-constrained clinical deployment.}
}



@InProceedings{pmlr-v315-hagen26a,
  title = 	 {Task-Conditioned 3D U-Nets via Hypernetworks for Data-Scarce Medical Segmentation},
  author =       {Hagen, Luca and M{\"u}ller, Johanna P. and Gmeiner, Moritz and Kainz, Bernhard},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3542--3560},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/hagen26a/hagen26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/hagen26a.html},
  abstract = 	 {Training 3D segmentation models typically requires extensive expert annotation, which is costly and often unavailable for rare or low-prevalence pathologies. We propose a hypernetwork-based framework that amortises the prediction of parameters for compact 3D U-Nets, enabling task-specific specialisation from as little as a single annotated volume. By learning shared anatomical structure, such as coarse shape, scale, and spatial organisation, across organs and imaging modalities, the hypernetwork generates task-conditioned network parameters, allowing controlled adaptation to previously unseen but anatomically related targets without full retraining. We evaluate the proposed approach on the CT TotalSegmentator and Medical Segmentation Decathlon benchmarks. The method achieves strong one-shot performance for anatomically homogeneous structures (e.g., liver, spleen, atrium) and demonstrates stable few-shot adaptation for more heterogeneous or low-contrast targets (e.g., tumours, prostate). In regimes with two to four annotated volumes, hypernetwork-generated U-Nets consistently outperform pretrained baselines and substantially reduce the performance gap to fully supervised models while using minimal annotation. These results indicate that weight prediction serves as an effective task-informed prior for data-scarce 3D medical image segmentation.}
}



@InProceedings{pmlr-v315-eppink26a,
  title = 	 {PaSAL: A Deep Learning Pipeline for Pulmonary Artery-Vein Segmentation and Anatomical Labeling in Thoracic CT},
  author =       {Eppink, Jasper and Kervadec, Hoel and van Capelleveen, Julian and Verhoeff, Joost and Senan, Suresh and Bohoudi, Omar},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3561--3592},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/eppink26a/eppink26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/eppink26a.html},
  abstract = 	 {We present PaSAL, a deep learning pipeline for pulmonary artery-vein segmentation and anatomical labeling in thoracic CT. PaSAL combines an nnU-Net-based binary vessel segmentation model with a graph-based anatomical labeling framework that assigns 19 clinically defined vascular classes. The pipeline integrates vessel enhancement, skeletonization, and topology-aware label propagation to produce anatomically coherent outputs. PaSAL is trained on the HiPaS and PTL public datasets and evaluated on an external set of 63 clinical scans from Amsterdam UMC. On HiPaS, PaSAL achieves Dice scores of $89.5\%$ (arteries) and $88.1\%$ (veins). On PTL, voxel-level anatomical labeling accuracy reaches $90.1\%$ for arteries and $82.7\%$ for veins. Expert review confirms high anatomical plausibility and clinical utility, while showing weak correlation between standard quantitative metrics and perceived quality. To our knowledge, PaSAL is the first method to jointly perform artery-vein segmentation and anatomical labeling in CT. The results demonstrate robust performance across diverse anatomical presentations, including pre- and post-radiotherapy scans, and establish PaSAL as a useful baseline tool for vascular analysis in medical imaging.}
}



@InProceedings{pmlr-v315-shahid26a,
  title = 	 {Probabilistic Feature Imputation and Uncertainty-Aware Multimodal Federated Aggregation},
  author =       {Shahid, Nafis Fuad and Ahmed, Maroof and Haider, Md Akib and Sagor, Saidur Rahman and Rahman, Aashnan and Hossain, Md Azam},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3593--3607},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/shahid26a/shahid26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/shahid26a.html},
  abstract = 	 {Multimodal federated learning enables privacy-preserving collaborative model training in healthcare applications. However, a fundamental challenge arises from modality heterogeneity: many clinical sites possess only a subset of modalities due to resource constraints or workflow variations. Existing approaches address this through feature imputation networks that synthesize missing modality representations, yet these methods produce point estimates without reliability measures, forcing downstream classifiers to treat all imputed features as equally trustworthy. In safety-critical medical applications, this limitation poses significant risks. We propose the Probabilistic Feature Imputation Network (P-FIN), which outputs calibrated uncertainty estimates alongside imputed features. This uncertainty is leveraged at two levels: (1) locally, through sigmoid gating that attenuates unreliable feature dimensions before classification, and (2) globally, through Fed-UQ-Avg, an aggregation strategy that prioritizes updates from clients with reliable imputation. Experiments on federated chest X-ray classification using CheXpert, NIH Open-I, and PadChest demonstrate consistent improvements over deterministic baselines, with a +5.36\% AUC gain in the most challenging configuration.}
}



@InProceedings{pmlr-v315-collin26a,
  title = 	 {ASTIH: A collection of axon and myelin segmentation datasets from multiple histology studies},
  author =       {Collin, Armand and Boudreau, Mathieu and Cohen-Adad, Julien},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3608--3627},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/collin26a/collin26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/collin26a.html},
  abstract = 	 {Large-scale analysis of axon and myelin morphometry in nervous tissues is fundamental to neuroscience research, yet manual quantification remains a profound bottleneck, limiting the scale and efficiency of studies. To address this, we introduce the Axon Segmentation Training Initiative for Histology (ASTIH), a publicly accessible resource designed to propel the development and validation of automated histomorphometry tools. ASTIH comprises five meticulously curated datasets, standardized for machine learning applications, featuring over 69,000 manually segmented axon fibers. These datasets exhibit significant diversity, spanning three microscopy modalities (TEM, SEM, bright-field), three species (mouse, rat, rabbit), and three distinct anatomical regions (brain, spinal cord, peripheral nerves) with varying pixel resolutions (from 0.2 to 0.002 $\mu m/px$). All datasets contain detailed annotations with standardized boundary delineation between adjacent fibers, enabling effective use for both semantic and instance segmentation tasks. We also provide thoroughly evaluated baseline segmentation models for every dataset in the collection to facilitate future benchmarking.}
}



@InProceedings{pmlr-v315-spears26a,
  title = 	 {EPI Distortion Correction without Opposite Phase Encodings with Unsupervised INR-Based Deformable Registration},
  author =       {Spears, Tyler and Goldman, Myla and Fletcher, P. Thomas},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3628--3640},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/spears26a/spears26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/spears26a.html},
  abstract = 	 {Diffusion MRIs (dMRIs) provide a detailed look at the structure of the brain, but the acquired images come with many distortions. Echo planar imaging (EPI) sequences, nearly universal for dMRIs, are highly sensitive to inhomogeneities of the magnetic field in the scanner. This results in severe geometric distortion (up to tens of millimeters) in the phase encoding direction, particularly in areas with strong changes in tissue density such as the brainstem, temporal, and frontal regions. A common method for correcting EPI distortion is to collect an image with the opposite phase encoding (PE) direction and reconstruct the magnetic susceptibility field. However, many dMRI protocols, some still in use today, do not include this auxiliary acquisition. Other methods have attempted to register the distorted EPI to an anatomical reference, with less accurate results. In this work, we propose EPINR, an unsupervised implicit neural representation (INR) based registration model that builds on these previous works. EPINR learns the susceptibility field by warping a single b0 image to a T1w reference, without opposite PE acquisitions. EPINR also leverages its smooth and continuous representation to apply higher-order regularizations calculated analytically. We evaluate EPINR against several comparison methods, both traditional and learning-based, over two dMRI datasets. We perform further ablation analyses on the effect of different components in EPINR. Finally, we discuss the reasons for EPINR’s high performance, and how it can bring structural precision to previously compromised diffusion images.}
}



@InProceedings{pmlr-v315-wiers26a,
  title = 	 {Geometry-Aware Cardiac MRI Representation Learning with Equivariant Neural Fields},
  author =       {Wiers, Jesse L. and Wessels, David R. and Arts, Lukas P.A. and Ruiperez-Campillo, Samuel and Kolk, Maarten Z.H. and Tjong, Fleur V.Y. and Bekkers, Erik J.},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3641--3658},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/wiers26a/wiers26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/wiers26a.html},
  abstract = 	 {Cardiac MRI encodes detailed geometric information, but standard deep learning models rely on grid-based encoders that emphasize texture rather than structure. Neural fields offer a continuous alternative, yet Conditional Neural Fields (CNFs) compress each subject into a single global latent, discarding spatial organization. We evaluate Equivariant Neural Fields (ENFs) for cardiac MRI, which replace the global latent with a geometry-aware latent point cloud. ENFs achieve competitive reconstruction quality with far fewer decoder parameters and produce latents that are local, anatomically meaningful, and robust to geometric transformations. For downstream prediction tasks, ENF latents perform competitively with ResNet50 and global CNF latents across several clinical endpoints. These results position ENFs as a compact, interpretable, and geometry-aware alternative for cardiac MRI representation learning.}
}



@InProceedings{pmlr-v315-abouyoussef26a,
  title = 	 {Validating the Benefit of Combining Imaging and Clinical Data for Ischemic Stroke Outcome Prediction},
  author =       {Abouyoussef, Zeyad and Ospel, Johanna and Souza, Roberto},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3659--3678},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/abouyoussef26a/abouyoussef26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/abouyoussef26a.html},
  abstract = 	 {Endovascular treatment (EVT) has been proven to be a successful treatment for some cases of acute ischemic stroke. However, neuro-radiologists rely on a small set of clinical features to select patients for treatment. This leads to the exclusion of patients who would have benefited from treatment and the inclusion of others who would not have benefited from it. Deep learning has been used to predict stroke outcome from baseline imaging and clinical data, with most studies reporting that combining imaging and clinical data slightly outperforms classical methods (e.g., logistic regression) trained on clinical data only. However, it is not clear how much of this improvement is attributed to the imaging data and whether it is robust to larger and more diverse test sets. We use one of the largest multi-center acute ischemic stroke datasets ($n = 1,105$) to determine whether combining imaging and clinical data outperforms classical methods. We show that combining imaging and clinical data matches the performance of logistic regression ($0.72$ Area Under Receiver Operating Characteristic Curve (AUROC)) when evaluated on a multi-center test set of over $600$ samples. We examine the models’ predictions and weights and find that 1) both methods match each other’s prediction for $78\%$ of the samples, and 2) the weights associated with the imaging features are small compared to the clinical ones. This suggests that imaging features extracted from the deep learning model do not contribute to the prediction as much as the clinical ones.}
}



@InProceedings{pmlr-v315-zhou26a,
  title = 	 {TTT-UNet: Enhancing U-Net with Test-Time Training Layers for Biomedical Image Segmentation},
  author =       {Zhou, Rong and Yuan, Zhengqing and Yan, Zhiling and Sun, Weixiang and Zhang, Kai and Li, Yiwei and Ye, Yanfang and Li, Xiang and Sun, Lichao and He, Lifang},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3679--3703},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/zhou26a/zhou26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/zhou26a.html},
  abstract = 	 {Biomedical image segmentation is crucial for accurately diagnosing and analyzing various diseases. However, Convolutional Neural Networks (CNNs) and Transformers, the most commonly used architectures for this task, struggle to effectively capture long-range dependencies due to the inherent locality of CNNs and the computational complexity of Transformers. To address this limitation, we introduce TTT-UNet, a novel framework that integrates Test-Time Training (TTT) layers into the traditional U-Net architecture for biomedical image segmentation. TTT-UNet dynamically adjusts model parameters during test time, enhancing the model’s ability to capture both local and long-range features. We evaluate TTT-UNet on multiple medical imaging datasets, including 3D abdominal organ segmentation in CT and MR images, instrument segmentation in endoscopy images, and cell segmentation in microscopy images. The results demonstrate that TTT-UNet consistently outperforms state-of-the-art CNN-based and Transformer-based segmentation models across all tasks. The code is publicly available.}
}



@InProceedings{pmlr-v315-su26a,
  title = 	 {AdvDINO: Domain-Adversarial Self-Supervised Representation Learning for Spatial Proteomics},
  author =       {Su, Stella and Harary, Marc and Rodig, Scott J. and Lotter, William},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3704--3722},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/su26a/su26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/su26a.html},
  abstract = 	 {Self-supervised learning (SSL) has emerged as a powerful approach for learning visual representations without manual annotations. However, the robustness of standard SSL methods to domain shift—systematic differences across data sources—remains uncertain, posing an especially critical challenge in biomedical imaging where batch effects can obscure true biological signals. We present AdvDINO, a domain-adversarial SSL framework that integrates a gradient reversal layer into the DINOv2 architecture to promote domain-invariant feature learning. Applied to a real-world cohort of six-channel multiplex immunofluorescence (mIF) whole slide images from lung cancer patients, AdvDINO mitigates slide-specific biases to learn more robust and biologically meaningful representations than non-adversarial baselines. Across more than 5.46 million mIF image tiles, the model uncovers phenotype clusters with differing proteomic profiles and prognostic significance, and enables strong survival prediction performance via attention-based multiple instance learning. The improved robustness also extends to a breast cancer cohort. While demonstrated on mIF data, AdvDINO is broadly applicable to other medical imaging domains, where domain shift is a common challenge.}
}



@InProceedings{pmlr-v315-zheng26b,
  title = 	 {SAC-Diff: A Scan-Aware Consistency-Enhanced Diffusion Framework for Unsupervised Chest CT Anomaly Detection},
  author =       {Zheng, Xinyuan and Shinagawa, Yoshihisa and Farhand, Sepehr and Liu, Chi and Valadez, Gerardo Hermosillo and Guo, Xueqi},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3723--3749},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/zheng26b/zheng26b.pdf},
  url = 	 {https://proceedings.mlr.press/v315/zheng26b.html},
  abstract = 	 {Anomaly detection in medical imaging is important but challenging due to diverse and imbalanced pathologies. Supervised methods rely on large annotated datasets and generalize poorly to unseen conditions. Unsupervised generative methods, especially diffusion models, can learn normal anatomy and detect outliers, but often hallucinate because of the Gaussian noise design and insufficient anatomical guidance. To address these challenges, we propose SAC-Diff, a Scan-Aware Consistency-Enhanced Diffusion framework for unsupervised anomaly detection in automated lung disease screening using chest CT. SAC-Diff adopts simplex noise for detail-preserving diffusion perturbation, integrates scan awareness via (A) subject-aware anatomical priors into conditional diffusion and (B) background-aware masking for scan-specific variations and heterogeneous lung anomalies, and enhances robustness by enforcing consistency and quantifying uncertainty through multi-sample ensembling. We evaluate SAC-Diff on two disease datasets with distinct anomaly types, COVID-19 and interstitial lung disease (ILD), and observe substantial improvements over prior methods. On COVID-19, SAC-Diff achieves an IoU of 0.39 (+3.75\% improvement compared to existing methods) and a Dice of 0.53 (+2.99\%); on ILD, it improves IoU to 0.31 (+74.45\%) and Dice to 0.44 (+60.40\%). Our results demonstrate promise toward robust and annotation-free CT anomaly detection in hospital deployment.}
}



@InProceedings{pmlr-v315-wang26g,
  title = 	 {Detector-in-the-Loop Tracking: Active Memory Rectification for Stable Glottic Opening Localization},
  author =       {Wang, Huayu and Alattar, Bahaa and Yang, Cheng-Yen and Huang, Hsiang-Wei and Kim, Jung Heon and Shapiro, Linda and White, Nathan and Hwang, Jenq-Neng},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3750--3763},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/wang26g/wang26g.pdf},
  url = 	 {https://proceedings.mlr.press/v315/wang26g.html},
  abstract = 	 {Temporal stability in glottic opening localization remains challenging due to the complementary weaknesses of single-frame detectors and foundation-model trackers: the former lacks temporal context, while the latter suffers from memory drift. Specifically, in video laryngoscopy, rapid tissue deformation, occlusions, and visual ambiguities in emergency settings require a robust, temporally aware solution that can prevent progressive tracking errors. We propose Closed-Loop Memory Correction (CL-MC), a detector-in-the-loop framework that supervises Segment Anything Model 2 (SAM2) through confidence-aligned state decisions and active memory rectification. High-confidence detections trigger semantic resets that overwrite corrupted tracker memory, effectively mitigating drift accumulation with a training-free foundation tracker in complex endoscopic scenes. On emergency intubation videos, CL-MC achieves state-of-the-art performance, significantly reducing drift and missing rate compared with the SAM2 variants and open-loop-based methods. Our results establish memory correction as a crucial component for reliable clinical video tracking.}
}



@InProceedings{pmlr-v315-zhang26c,
  title = 	 {Unlocking 2D Promptable Foundation Models for 3D Vessel Segmentation by Automatic Prompt Generation},
  author =       {Zhang, Ziyu and Yu, Yi and Xue, Yuan},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3764--3778},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/zhang26c/zhang26c.pdf},
  url = 	 {https://proceedings.mlr.press/v315/zhang26c.html},
  abstract = 	 {3D vessel segmentation is a core task in medical image analysis, playing a crucial role in disease diagnosis and surgical planning. While fully supervised 3D segmentation methods rely on costly high-quality annotations, promptable models (e.g., ScribblePrompt) provide a promising alternative with their zero-shot generalization capability for efficient 3D segmentation. Nevertheless, when directly applied to 3D tasks, these 2D methods require slice-wise prompts, disregarding the continuity of 3D structures and leading to low efficiency. To address this issue, we propose an innovative method based on automatic prompt generation, which integrates with pre-trained 2D interactive models to achieve efficient 3D vessel segmentation. By leveraging spatial continuity and contextual information, our method automatically generates prompts across the entire 3D volume from a single user-provided prompt. Experiments conducted on public and in-house vessel datasets demonstrate the effectiveness of the proposed method, showing that it achieves segmentation accuracy comparable to or better than state-of-the-art models, while significantly reducing the interaction cost.}
}



@InProceedings{pmlr-v315-tur26a,
  title = 	 {WFM: 3D Wavelet Flow Matching for Ultrafast Multi-Modal MRI Synthesis},
  author =       {Tur, Yalcin and Stojkovic, Mihajlo and Bagci, Ulas},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3779--3796},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/tur26a/tur26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/tur26a.html},
  abstract = 	 {Diffusion models have achieved remarkable quality in multi-modal MRI synthesis, but their computational cost (hundreds of sampling steps and separate models per modality) limits clinical deployment. We observe that this inefficiency stems from an unnecessary starting point: diffusion begins from pure noise, discarding the structural information already present in available MRI sequences. We propose WFM (Wavelet Flow Matching), which instead learns a direct flow from an informed prior, the mean of conditioning modalities in wavelet space, to the target distribution. Because the source and target share underlying anatomy and differ primarily in contrast, this formulation enables accurate synthesis in just 1-2 integration steps. A single 82M-parameter model with class conditioning synthesizes all four BraTS modalities (T1, T1c, T2, FLAIR), replacing four separate diffusion models totaling 326M parameters. On BraTS 2024, WFM achieves 26.8 dB PSNR and 0.94 SSIM, within 1-2 dB of diffusion baselines, while running 250-1000x faster (0.16-0.64s vs. 160s per volume). This speed-quality trade-off makes real-time MRI synthesis practical for clinical workflows.}
}



@InProceedings{pmlr-v315-roy26a,
  title = 	 {Is Exchangeability better than I.I.D. to handle Data Distribution Shifts while Pooling Data for Data-scarce Medical image segmentation?},
  author =       {Roy, Ayush and Enam, Samin and Xia, Jun and Kim, Won Hwa and Lokhande, Vishnu Suresh},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3797--3826},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/roy26a/roy26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/roy26a.html},
  abstract = 	 {Data scarcity is a major challenge in medical imaging, particularly for deep learning models. While data pooling (combining datasets from multiple sources) and data addition (adding more data from a new dataset) have been shown to enhance model performance, they are not without complications. Specifically, increasing the size of the training dataset through pooling or addition can induce distributional shifts, negatively affecting downstream model performance, a phenomenon known as the “Data Addition Dilemma”. While the traditional i.i.d. assumption may not hold in multi-source contexts, assuming exchangeability across datasets provides a more practical framework for data pooling. In this work, we investigate medical image segmentation under these conditions, drawing insights from causal frameworks to propose a method for controlling foreground-background feature discrepancies across all layers of deep networks. This approach improves feature representations, which are crucial in data-addition scenarios. Our method achieves state-of-the-art segmentation performance on histopathology and ultrasound images across five datasets, including a novel ultrasound dataset that we have curated and contributed. Qualitative results demonstrate more refined and accurate segmentation maps compared to prominent baselines across three model architectures.}
}



@InProceedings{pmlr-v315-zhong26a,
  title = 	 {Orientation-Aware Diffusion Super-Resolution for 3T-Like Fetal MRI from Routine 1.5T Scans},
  author =       {Zhong, Xinliu and Liu, Ruiying and Lin, Guohao and Huang, Chuan and Goldman-Yassen, Adam Ezra and Mehollin-Ray, Amy Robben and Wang, Yun},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3827--3845},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/zhong26a/zhong26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/zhong26a.html},
  abstract = 	 {Fetal MRI plays a central role in assessing early brain development. While 3T scanners offer higher SNR and improved cortical detail, their increased sensitivity to motion, susceptibility artifacts, and $B_1$ inhomogeneity limits wide adoption for routine fetal imaging. Consequently, most clinical examinations are performed at 1.5T, where greater motion tolerance comes at the cost of lower SNR, reduced gray-white matter contrast, and partial-volume blurring - factors that undermine downstream morphometric analysis. Bridging this quality gap without sacrificing motion robustness of 1.5T would enable 3T-like morphometric reliability in routine clinical acquisitions. We propose an orientation-aware diffusion super-resolution framework that synthesizes 3T-like fetal brain contrast from routine 1.5T scans. The model combines a Swin-UNet backbone with gated FiLM-based orientation embeddings and a residual error-shifting diffusion mechanism. Training leverages the FaBiAN phantom to generate controllable high-/low-resolution pairs with monotonic intensity remapping, geometric perturbations, and simulated signal voids, thereby ensuring generalization to clinical data. Our model produces markedly sharper gyri and mitigates partial-volume effects in both synthesized and clinical data. When evaluated using Fetal-SynthSeg following NeSVoR reconstruction, the framework consistently improves tissue segmentation accuracy over state-of-the-art restoration baselines, yielding more reliable morphometric estimates for fetal brain analysis.}
}



@InProceedings{pmlr-v315-rahman26b,
  title = 	 {DTC-WSI: Dynamic Token Compression for Whole-Slide Images},
  author =       {Rahman, Tawsifur and Tarkhan, Aliasghar and Chellappa, Rama and Baras, Alexander S.},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3846--3865},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/rahman26b/rahman26b.pdf},
  url = 	 {https://proceedings.mlr.press/v315/rahman26b.html},
  abstract = 	 {Whole-slide images (WSIs) contain tens of thousands of heterogeneous patches, making transformer-based multiple-instance learning (MIL) computationally expensive due to quadratic attention costs and substantial redundancy in tissue morphology. Existing token-reduction approaches for WSI analysis rely primarily on pruning, which discards information early in training and destabilizes optimization under weak supervision. We propose Dynamic Token Compression for Whole-Slide Images (DTC-WSI), a token-efficient MIL framework that performs progressive, importance-aware WSI compression. DTC-WSI integrates a lightweight saliency network with a multi-stage token compressor that combines bipartite similarity matching and soft differentiable pruning to gradually eliminate redundant or non-diagnostic patches. During training, soft gates enable stable gradient flow, while inference employs deterministic compression for substantial acceleration. This curriculum-style compression preserves discriminative morphology and dramatically reduces computational burden. Across four WSI benchmarks (TCGA-NSCLC, TCGA-BRCA, TCGA-RCC, PANDA), DTC-WSI achieves 5–10$\times$ token reduction, up to 5.3$\times$ faster inference, and 20–40% lower memory usage, while improving MIL classification accuracy by 2–4% over state-of-the-art baselines. Our results demonstrate that dynamic token compression is a powerful and scalable alternative to pruning, enabling efficient transformer-based WSI analysis while improving accuracy.}
}



@InProceedings{pmlr-v315-zhang26d,
  title = 	 {Quantifying and Mitigating Hospital Domain Bias in Pathology Foundation Models using Adversarial Feature Disentanglement},
  author =       {Zhang, Mengliang},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3866--3884},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/zhang26d/zhang26d.pdf},
  url = 	 {https://proceedings.mlr.press/v315/zhang26d.html},
  abstract = 	 {Pathology foundation models (PFMs) have demonstrated remarkable potential in whole-slide image (WSI) diagnosis. However, pathology images from different hospitals exhibit domain shifts due to variations in scanning hardware and preprocessing. These differences cause PFMs to learn spurious hospital-specific features, severely compromising their robustness and generalizability in clinical settings. We present the first systematic study of this hospital-source domain bias in PFMs. To address the critical trade-off between diagnostic utility and domain predictability, we establish a quantification pipeline and introduce the Robustness Index (RI). Furthermore, we propose a lightweight adversarial framework for feature disentanglement. This framework employs a trainable adapter and a domain classifier connected via a Gradient Reversal Layer (GRL) to remove latent hospital-specific information from frozen PFM representations without modifying the encoder itself. Experiments on multi-center histopathology datasets demonstrate that our approach substantially suppresses domain predictability and achieves significant gains in feature robustness. Crucially, the method maintains or improves disease classification performance, proving its efficacy particularly in out-of-domain scenarios.}
}



@InProceedings{pmlr-v315-minh26a,
  title = 	 {MGMT Promoter Methylation Prediction in Glioblastoma Using 3D CNNs with Advanced MRI Sequences},
  author =       {Minh, Tran Nguyen Tuan and Kha, Quang Hien and Le, Viet Huan and Chua, Matthew Chin Heng and Le, Nguyen Quoc Khanh},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3885--3898},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/minh26a/minh26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/minh26a.html},
  abstract = 	 {Accurate determination of O6-methylguanine-DNA methyltransferase (MGMT) promoter methylation status is essential for therapeutic planning in glioblastoma (GBM). Although molecular assays remain the reference standard, they are costly, invasive, and not always feasible in routine practice. This has motivated the development of non-invasive MRI-based deep learning approaches, particularly those leveraging advanced physiological imaging sequences. In this study, we investigated whether arterial spin labeling (ASL) and apparent diffusion coefficient (ADC) imaging provide complementary information for predicting MGMT methylation status in IDH-wildtype GBM. We analyzed 351 patients from the UCSF Preoperative Diffuse Glioma MRI dataset and trained 3D convolutional neural network models based on a ResNet-10 architecture using ASL, ADC, diffusion-weighted imaging (DWI), and conventional T2-FLAIR sequences. Among single-sequence models, ASL achieved the highest performance (accuracy of 0.76, precision of 0.75, and F1 score of 0.73). A dual-sequence model combining ASL and ADC further improved prediction, yielding an AUC of 0.83, significantly outperforming both the ASL-only model and the T2-FLAIR model (AUC 0.6524; DeLong test, $p<0.05$). These results demonstrate that integrating perfusion- and diffusion-based MRI captures complementary physiological characteristics relevant to MGMT methylation, offering a more accurate and fully non-invasive alternative for biomarker assessment. Incorporating advanced MRI sequences into deep learning pipelines may support more informed treatment planning and improve clinical decision-making for patients with GBM.}
}



@InProceedings{pmlr-v315-goyal26a,
  title = 	 {Generating Post-Acetazolamide Cerebral Blood Flow MRI for High-Risk Stroke Patients},
  author =       {Goyal, Rydham and Gonzalez, Camila and Alexander, Sasha and Zou, Aja and Moseley, Michael E and Zhao, Moss Y and Steinberg, Gary K},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3899--3910},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/goyal26a/goyal26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/goyal26a.html},
  abstract = 	 {Cerebrovascular reserve (CVR) quantifies the brain’s ability to augment cerebral blood flow in response to a vasodilatory stimulus. It is a key biomarker in Moyamoya disease and other steno-occlusive cerebrovascular disorders. Clinically, CVR is typically assessed by administering acetazolamide (ACZ) and acquiring post-ACZ perfusion maps, but this workflow is time-consuming, costly, and contraindicated in a subset of patients. In this work, we investigate whether deep learning can predict post-ACZ perfusion directly from baseline arterial spin labeling (ASL) MRI, enabling pharmacologic-free CVR estimation. We curate a single-center dataset of Moyamoya ASL perfusion imaging, comprising pre/post-ACZ scan pairs from 194 patients. We design a post-ACZ conditional Autoencoder (cAE) network to regress the middle axial post-ACZ slice from the corresponding pre-ACZ slice using a combined L1 and SSIM loss. We evaluate our method against three diffusion-based formulations (conditional DDPM, Cold Diffusion, and Residual Diffusion). On a holdout test set of 49 patients, the proposed post-ACZ cAE achieves the highest reconstruction fidelity (SSIM $\approx$ 0.79), outperforming diffusion-based baselines in MAE, SSIM, and PSNR. Region-wise analysis of CBF percentage change in affected versus healthy MCA territories showed that the generated post-ACZ model outputs followed ground truth patterns of cerebrovascular reserve. Our findings demonstrate the feasibility of non-invasive CVR assessment using MRI for high-risk patients. Our data-driven approach could reduce reliance on ACZ challenges in routine clinical workflow and expand access to CVR testing to evaluate brain health.}
}



@InProceedings{pmlr-v315-lin26c,
  title = 	 {ResGAT: A Residual Graph Attention Network for Cancer Subtype Classification in Whole Slide Images},
  author =       {Lin, Zhenhan and Tong, Hao and Hu, Yunfei and Gui, Xianyong and Shen, Jeanne and Lee, Byrne and Zhang, Lu and Moyer, Daniel and Zhou, Mu and Zhou, Xin Maizie and Votanopoulos, Konstantinos},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3911--3930},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/lin26c/lin26c.pdf},
  url = 	 {https://proceedings.mlr.press/v315/lin26c.html},
  abstract = 	 {Multiple instance learning (MIL) provides a weakly supervised framework for whole slide image (WSI) classification, enabling slide-level prediction from gigapixel images with only slide-level labels. However, WSI subtype classification in realistic settings is still challenging. In this work, we propose ResGAT, a residual graph attention framework that operates on hybrid $k$-NN patch graphs and models WSI representations with stacked residual graph attention blocks. ResGAT is evaluated on the subtype classification task across a rare, class-imbalanced appendiceal cancer cohort, BRACS and two TCGA datasets. It outperforms SOTA MIL baselines on the appendiceal cancer and BRACS cohorts, and remains competitive on the TCGA datasets. On the appendiceal cancer cohort, we further assess cross-site generalization via few-shot adaptation under source shift, showing that ResGAT adapts effectively to new domains with limited labels. An ablation study is provided to validate the effectiveness of key architectural components of our method.}
}



@InProceedings{pmlr-v315-sabih26a,
  title = 	 {GraphULM: A Multi-Resolution CNN and GCN Framework for Ultrasound Localization Microscopy},
  author =       {Sabih, Mohammad and Almekkawy, Mohamed Khaled},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3931--3946},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/sabih26a/sabih26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/sabih26a.html},
  abstract = 	 {Ultrasound Localization Microscopy (ULM) is a prominent technique in medical imaging, widely applied to enhance super-resolution, particularly in in-vivo settings. The process of localization, followed by tracking of microbubbles (MBs), poses a significant challenge in ULM due to its intricacy and complexity. High MB densities intensify these challenges, thereby diminishing the performance of traditional methods and certain deep learning algorithms in achieving precise localization. We present GraphULM, a novel and computationally efficient architecture that combines a Multi-Resolution Convolutional Neural Network (MRCNN) with a Graph Convolutional Network (GCN) to enhance localization efficacy in ULM. To develop an optimal training dataset, synthetically generated data is pre-combined with in-vivo b-mode samples, which improves feature diversity and generalization. Experimental in-vivo evaluations demonstrate the model’s high performance, reporting a localization precision of 21.9 $\mu$m and a Jaccard index of 0.75 at an MB density of 2 MB/mm$^2$, underscoring the model’s robustness. Additionally, our Frequency Ring Correlation (FRC) analysis reveals a remarkable resolution of 5.62 $\mu$m. The model operates at three times the speed of traditional pipelines, establishing its suitability for rapid ULM applications.}
}



@InProceedings{pmlr-v315-huang26b,
  title = 	 {BrainATCL: Adaptive Temporal Brain Connectivity Learning for Functional Link Prediction and Age Estimation},
  author =       {Huang, Yiran and Nouranizadeh, Amirhossein and Ahrends, Christine and Xu, Mengjia},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3947--3970},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/huang26b/huang26b.pdf},
  url = 	 {https://proceedings.mlr.press/v315/huang26b.html},
  abstract = 	 {Functional Magnetic Resonance Imaging (fMRI) is an imaging technique widely used to study human brain activity. fMRI signals in areas across the brain transiently synchronise and desynchronise their activity in a highly structured manner, even when an individual is at rest. These functional connectivity dynamics may be related to behaviour and neuropsychiatric disease. To model these dynamics, temporal brain connectivity representations are essential, as they reflect evolving interactions between brain regions and provide insight into transient neural states and network reconfigurations. However, conventional graph neural networks (GNNs) often struggle to capture long-range temporal dependencies in dynamic fMRI data. To address this challenge, we propose BrainATCL, an unsupervised, nonparametric framework for adaptive temporal brain connectivity learning, enabling functional link prediction and age estimation. Our method dynamically adjusts the lookback window for each snapshot based on the rate of newly added edges. Graph sequences are subsequently encoded using a GINE-Mamba2 backbone to learn spatial-temporal representations of dynamic functional connectivity in resting-state fMRI data of 1,000 participants from the Human Connectome Project. To further improve spatial modeling, we incorporate brain structure and function-informed edge attributes, i.e., the left/right hemispheric identity and subnetwork membership of brain regions, enabling the model to capture biologically meaningful topological patterns. We evaluate our BrainATCL on two tasks: functional link prediction and age estimation. The experimental results demonstrate superior performance and strong generalization, including in cross-session prediction scenarios.}
}



@InProceedings{pmlr-v315-guo26a,
  title = 	 {Expert Branches: Module Diversity for Stronger Feature Learning in Laparoscopic Segmentation},
  author =       {Guo, Lin and Camerota, Chiara and Mahmoud, Mohammad and Esposito, Flavio},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3971--3985},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/guo26a/guo26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/guo26a.html},
  abstract = 	 {Module diversity fundamentally enhances a model’s ability to learn geometric structure by enabling a broader and more expressive set of feature representations. While many architectures improve performance by scaling parameters or relying on large-scale pretraining, these strategies make it difficult to identify which design principles truly enhance feature learning capability, especially in challenging domains with limited data such as laparoscopic surgical segmentation. This work investigates a parameter-constrained, no-pretraining setting to isolate the intrinsic feature learning capability of different module configurations. We introduce expert branches, a design concept that assigns different module families to their own independent pathways rather than mixing all features within a single stream. This separation encourages branch-specific specialization (Experts), reduces parameters, and avoids the entanglement that commonly obscures each module’s contribution. We test this idea with TriEB, a UNet-based model incorporating CNN, deformable-convolution, and dynamic-snake branches with fewer total parameters. TriEB surpasses the vanilla UNet, the non-diverse TriCNN counterpart, and transformer-based models including SegFormer and Swin on the DSAD laparoscopic dataset. These results demonstrate that expert branches offer a more effective design principle for extracting diverse features from surgical imagery. The study highlights module diversity as a promising, architecture-agnostic framework for building efficient, interpretable, and data-adaptive feature extractors.}
}



@InProceedings{pmlr-v315-singh26a,
  title = 	 {A Diffusion-Driven Fine-Grained Nodule Synthesis Framework for Enhanced Lung Nodule Detection from Chest Radiographs},
  author =       {Singh, Shreshtha and Goyal, Aryan and Mittal, Ashish and Tadepalli, Manoj and Kumar, Piyush and Putha, Preetham},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {3986--4009},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/singh26a/singh26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/singh26a.html},
  abstract = 	 {Early detection of lung cancer in chest radiographs (CXRs) is crucial for improving patient outcomes, yet nodule detection remains challenging due to their subtle appearance and variability in radiological characteristics like size, texture, and boundary. For robust analysis, this diversity must be well represented in training datasets for deep learning-based Computer-Assisted Diagnosis (CAD) systems. However, assembling such datasets is costly and often impractical, motivating the need for realistic synthetic data generation. Existing methods lack fine-grained control over synthetic nodule generation, limiting their utility in addressing data scarcity. This paper proposes a novel diffusion-based framework with low-rank adaptation (LoRA) adapters for characteristic-controlled nodule synthesis on CXRs. We begin by addressing size and shape control through nodule mask conditioned training of the base diffusion model. To achieve individual characteristic control, we train separate LoRA modules, each dedicated to a specific radiological feature. However, since nodules rarely exhibit isolated characteristics, effective multi-characteristic control requires a balanced integration of features. We address this by leveraging the dynamic composability of LoRAs and revisiting existing merging strategies. Building on this, we identify two key issues: overlapping attention regions and non-orthogonal parameter spaces. To overcome these limitations, we introduce a novel orthogonality loss term during LoRA composition training. Extensive experiments on both in-house and public datasets demonstrate improved downstream nodule detection. Radiologist evaluations confirm the fine-grained controllability of our generated nodules, and across multiple quantitative metrics, our method surpasses existing nodule generation approaches for CXRs.}
}



@InProceedings{pmlr-v315-lu26b,
  title = 	 {HiPro-CT: A Hierarchical Probabilistic Framework for 3D Medical Vision-Language Alignment},
  author =       {Lu, Lin and Liu, Zihan and Tang, Chaoxiang and Zhang, Hui},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {4010--4025},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/lu26b/lu26b.pdf},
  url = 	 {https://proceedings.mlr.press/v315/lu26b.html},
  abstract = 	 {The adaptation of vision-language models (VLMs) to 3D medical imaging is currently impeded by two fundamental bottlenecks: the dilution of local features caused by the granularity mismatch between volumetric data and textual reports, and the inability of deterministic embeddings to capture the inherent semantic uncertainty of clinical descriptions. To address these challenges, we propose HiPro-CT, a novel hierarchical probabilistic framework for 3D medical vision-language alignment. Unlike traditional point-based approaches, HiPro-CT maps images and texts into Gaussian probability distributions, utilizing variance to explicitly quantify uncertainty and enhance robustness against incompleteness and polysemy. We introduce a soft masked pooling strategy that performs weighted feature aggregation guided by anatomical masks, enabling precise organ-level alignment while preserving boundary context. Furthermore, we devise a hierarchical inclusion loss to enforce geometric constraints within the embedding space, ensuring that the deterministic global representations are geometrically grounded within the strictly more uncertain local distributions. Extensive experiments demonstrate that HiPro-CT significantly outperforms state-of-the-art deterministic baselines in zero-shot multi-abnormality detection and cross-modal retrieval, validating the efficacy of integrating fine-grained anatomical supervision with probabilistic representation learning.}
}



@InProceedings{pmlr-v315-engelson26a,
  title = 	 {Revealing and Reducing Morphological Biases Using Implicit Neural Representations for Medical Image Registration},
  author =       {Engelson, Sofija and Kahrs, Bennet and Kepp, Timo and Andresen, Julia and Handels, Heinz and Ehrhardt, Jan},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {4026--4041},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/engelson26a/engelson26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/engelson26a.html},
  abstract = 	 {Deep learning has enhanced medical image analysis, yet models trained on imbalanced or non-representative populations often exhibit systematic biases, which can lead to substantial performance disparities across patient subgroups. Addressing these disparities is essential to ensure fair and reliable model deployment in clinical practice. Particularly in medical imaging, population-level biases can oftentimes be attributed to morphological rather than intensity differences, such as sex-related differences in organ volume. Given that morphological biases in neuroimaging data spuriously correlate with the disease label, we show that bias detection based on general foundation model features (e.g., CLIP and BiomedCLIP) insufficiently captures morphological biases. Therefore, we introduce a bias detection and mitigation pipeline that performs subgroup discovery on deformation representations from a generalizable implicit neural representation (INR). This proof-of-concept study indicates improved performance when using deformation representations instead of general image features for bias detection. Furthermore, our results show that re-balancing the training dataset using the identified subgroups, complemented by INR-generated samples for augmentation, helps to mitigate the bias effect.}
}



@InProceedings{pmlr-v315-dong26a,
  title = 	 {LLaMA32-Med: Parameter-Efficient Adaptation of Multimodal LLMs for Medical Visual Question Answering},
  author =       {Dong, Wanqi and Ge, Jingze and Dong, Wanyue and Motani, Mehul},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {4042--4056},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/dong26a/dong26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/dong26a.html},
  abstract = 	 {Artificial intelligence has shown great promise in healthcare, particularly in diagnostic support. While healthcare data is inherently multimodal, existing models struggle to fully leverage diverse clinical data, e.g., images and text. Although recent Multimodal Large Language Models (MLLMs) exhibit strong potential, their performance in medical scenarios is constrained by training on general-domain data and the high computational cost of full-parameter adaptation. In this work, we present a two-stage lightweight adaptation framework for fine-tuning general-purpose MLLMs on medical multimodal tasks. Building on the LLaMA 3.2 Vision-Instruct model, we adopt parameter-efficient fine-tuning techniques that update less than 2% of the model parameters. This enables the injection of domain-specific medical knowledge while requiring approximately 20 GB of GPU memory. Furthermore, we design task-specific and role-based prompting strategies to better guide medical visual understanding tasks. Experimental results show that our approach achieves performance comparable to or surpassing state-of-the-art methods while significantly outperforming the original general-domain model. Comparative evaluations with recent MLLMs highlight the strong adaptability of the LLaMA 3.2 Vision-Instruct backbone, validating its effectiveness as a foundation for practical multimodal medical AI systems.}
}



@InProceedings{pmlr-v315-singh26b,
  title = 	 {Bridging Classical and Learned Priors: A Hybrid Framework for Medical Image Enhancement},
  author =       {Singh, Peeyush Kumar and Singh, Sneha},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {4057--4070},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/singh26b/singh26b.pdf},
  url = 	 {https://proceedings.mlr.press/v315/singh26b.html},
  abstract = 	 {Medical image enhancement faces a fundamental trade-off: classical methods preserve anatomical fidelity but over-smooth fine structures, while deep learning approaches risk generating unrealistic artifacts on limited clinical data. We introduce a hybrid framework combining classical preprocessing with pretrained diffusion priors for high-quality enhancement across modalities. Our method leverages a pretrained Stable Diffusion model without requiring domain-specific training. During inference, classical enhancement methods generate pseudo-labels. The frozen diffusion model leverages its learned priors to refine fine structures while gradient-based guidance anchors generation to the pseudo-label, preventing hallucinations. We demonstrate efficacy in ultrasound and MRI segmentation and achieve significant improvements in multi-class cardiac structure segmentation compared to baseline models. Critical insights include: (1) pseudo-labels outperform multi-stage classical pipelines by providing differentiable guidance targets for diffusion models; (2) testing segmentation models on enhanced images yields additional performance gains; and (3) pseudo-label guidance strength requires domain-specific tuning to balance classical robustness with learned refinement. With extensive evaluation across imaging modalities, we show that pretrained diffusion models can enhance medical images while preserving the interpretability and diagnostic fidelity essential for clinical deployment.}
}



@InProceedings{pmlr-v315-andrusca26a,
  title = 	 {A Multi-Scale Inception-UNet with Structure-Aware Evaluation for Branch-Preserving Segmentation of Organoids},
  author =       {Andrusca, Sandra H. and Kie{\ss}ling, Christopher D. and Bausch, Andreas R.},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {4071--4084},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/andrusca26a/andrusca26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/andrusca26a.html},
  abstract = 	 {Branched organoids exhibit increasingly complex morphologies as they progress from simple spheroid states to highly ramified structures, making topology-preserving segmentation essential for quantitative biological analysis. Capturing thin protrusions and maintaining branch continuity remains challenging for classical UNet-based architectures, particularly in brightfield imaging where fine structures are easily blurred or disconnected. In this work, we present a multi-scale Inception-UNet designed to capture the heterogeneous spatial scales of branched organoids through parallel convolutional paths with complementary receptive fields. As a model system, we analyze brightfield pancreatic ductal adenocarcinoma (PDAC) organoids, a system known for strong morphological heterogeneity and invasive branching behavior, cultured using high-throughput Patternoid assays that enable standardized imaging and robust quantitative analysis. To assess segmentation quality beyond region overlap, we combine Dice with the structure-aware clDice metric that directly probes branch integrity and topological continuity. Across deterministic seeds and strictly separated organoid positions, the Inception-UNet achieves the highest region-based Dice ($0.868 \pm 0.062$) and clDice ($0.545 \pm 0.123$), and most importantly, the strongest preservation of branch continuity compared to UNet and UNet++. These improvements become increasingly pronounced with growing morphological complexity. Overall, our results demonstrate that multi-scale feature extraction combined with topology-aware evaluation substantially improves segmentation of branched organoids and provides a robust foundation for downstream morphological and invasion-related analyses.}
}



@InProceedings{pmlr-v315-zech26a,
  title = 	 {Heteroscedastic Heatmap Regression for Reliable Pectoral Muscle Segmentation in Mammography},
  author =       {Zech, Paul and H\"{u}mmer, Christian and El-Zein, Benjamin and Syben, Christopher and Ritschl, Ludwig and Kappler, Steffen and Stober, Sebastian},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {4085--4101},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/zech26a/zech26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/zech26a.html},
  abstract = 	 {Breast cancer remains a leading cause of mortality worldwide, making accurate mammography screening essential for early detection. An important preprocessing step in mammography is the accurate segmentation of the pectoral muscle, as it affects downstream tasks such as breast density estimation or automated exposure control. Existing automated segmentation methods, both traditional and deep learning-based, often lack reliable confidence measures, which becomes especially problematic in the presence of occlusions or visually confounding structures such as skin folds or other muscle fibers. To address this limitation, we propose a probabilistic framework that combines heatmap-based boundary regression with heteroscedastic uncertainty estimation to capture input-dependent variability. Our approach not only predicts the pectoral muscle boundary but also quantifies the associated uncertainty. While mainly producing unimodal predictions, the probabilistic heatmaps reveal multimodal patterns for confounding structures, further enhancing transparency in challenging cases. We demonstrate that our method provides robust and transparent means to achieve accurate segmentation while producing meaningful uncertainty estimates.}
}



@InProceedings{pmlr-v315-bader26a,
  title = 	 {MApLe: Multi-instance Alignment of Diagnostic Reports and Large Medical Images},
  author =       {Bader, Felicia and Seeb\"ock, Philipp and Bartashova, Anastasia and Attenberger, Ulrike and Langs, Georg},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {4102--4116},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/bader26a/bader26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/bader26a.html},
  abstract = 	 {In diagnostic reports, experts encode complex imaging data into clinically actionable information. They describe subtle pathological findings that are meaningful in their anatomical context. Reports follow relatively consistent structures, expressing diagnostic information with few words that are often associated with tiny but consequential image observations. Standard vision language models struggle to identify the associations between these informative text components and small locations in the images. Here, we propose "MApLe", a multi-task, multi-instance vision language alignment approach that overcomes these limitations. It disentangles the concepts of anatomical region and diagnostic finding, and links local image information to sentences in a patch-wise approach. Our method consists of a text embedding trained to capture anatomical and diagnostic concepts in sentences, a patch-wise image encoder conditioned on anatomical structures, and a multi-instance alignment of these representations. We demonstrate that MApLe can successfully align different image regions and multiple diagnostic findings in free-text reports. We show that our model improves the alignment performance compared to state-of-the-art baseline models when evaluated on several downstream tasks.}
}



@InProceedings{pmlr-v315-salort-benejam26a,
  title = 	 {Endo-4DTS: Monocular 4D Scene Synthesis for Endoscopy via Deformable Triangle Splatting},
  author =       {Salort-Benejam, Laura and Agudo, Antonio},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {4117--4133},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/salort-benejam26a/salort-benejam26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/salort-benejam26a.html},
  abstract = 	 {Endoscopy is an essential procedure in medical imaging, routinely applied for diagnostic, prognostic and therapeutic purposes. Developing robust methods for 3D reconstruction of endoscopic videos has the potential to improve the visualization of complex anatomies, increase diagnostic accuracy, and guide surgical procedures. Despite recent advancements, the task remains highly challenging. The deformable nature of soft tissues makes classical computer-vision algorithms useless, and additional difficulties arise from the widespread use of monocular cameras, unknown camera parameters, occlusions, illumination changes, motion blur and other artifacts. In this work, we present Endo-4DTS, a novel self-supervised pipeline based on triangle splatting for 4D scene synthesis of deformable endoscopy scenes from monocular videos with a static camera, the first time this type of solution has been proposed for endoscopic images and time-varying tissues. Our approach represents the endoscopic environment with a canonical set of triangles, optimized jointly with a deformation network, enabling consistent 4D synthesis of dynamic tissues. We incorporate additional geometric and depth-based objectives that further guide learning in the challenging context of deformable endoscopic scenes. Experiments on several endoscopic videos with non-rigid tissues, occlusions and illumination changes show that Endo-4DTS reliably captures soft-tissue deformations in endoscopic scenes. We demonstrate that Endo-4DTS consistently outperforms previous state-of-the-art methods across multiple metrics.}
}



@InProceedings{pmlr-v315-jena26a,
  title = 	 {The LUMirage: An independent evaluation of zero-shot performance in the LUMIR challenge},
  author =       {Jena, Rohit and Chaudhari, Pratik and Gee, James},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {4134--4165},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/jena26a/jena26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/jena26a.html},
  abstract = 	 {The LUMIR challenge represents an important benchmark for evaluating deformable image registration methods on large-scale neuroimaging data. While the challenge demonstrates that modern deep learning methods achieve competitive accuracy on T1-weighted MRI, it also claims exceptional zero-shot generalization to unseen contrasts and resolutions—assertions that contradict established understanding of domain shift in deep learning. In this paper, we perform an independent re-evaluation of these zero-shot claims using rigorous evaluation protocols while addressing potential sources of instrumentation bias. Our findings reveal a more nuanced picture: (1) deep learning methods perform comparably to iterative optimization on in-distribution T1w images and even on human-adjacent species (macaque), demonstrating improved task understanding; (2) however, performance degrades significantly on out-of-distribution contrasts (T2, T2*, FLAIR), with Cohen’s d scores ranging from 0.7–1.5, indicating substantial practical impact on downstream clinical workflows; (3) deep learning methods face scalability limitations on high-resolution data, failing to run on 0.6mm isotropic images, while iterative methods benefit from increased resolution; and (4) deep methods exhibit high sensitivity to preprocessing choices. These results align with the well-established literature on domain shift and suggest that claims of universal zero-shot superiority require careful scrutiny. We advocate for evaluation protocols that reflect practical clinical and research workflows rather than conditions that may inadvertently favor particular method classes.}
}



@InProceedings{pmlr-v315-chen26c,
  title = 	 {Algorithms Trained on Normal Chest X-rays Can Predict Health Insurance Types},
  author =       {Chen, Chi-Yu and Abulibdeh, Rawan and Asgari, Arash and Ord\'o\~nez, Sebasti\'an Andr\'es Cajas and Celi, Leo Anthony and Goode, Deirdre and Hamidi, Hassan and McCague, Ned and Seyyed-Kalantari, Laleh and Sounack, Thomas and Kuo, Po-Chih},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {4166--4181},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/chen26c/chen26c.pdf},
  url = 	 {https://proceedings.mlr.press/v315/chen26c.html},
  abstract = 	 {Artificial intelligence is revealing what medicine never intended to encode. Deep vision models, trained on chest X-rays, can now detect not only disease but also invisible traces of social inequality. In this study, we show that state-of-the-art architectures (DenseNet121, SwinV2-T, MedMamba) can predict a patient’s health insurance type, a strong proxy for socioeconomic status, from normal chest X-rays with significant accuracy (AUC $\approx$ 0.70 on MIMIC-CXR-JPG, 0.68 on CheXpert). Our machine learning study combining age, race, and sex labels to predict health insurance types indicates that this signal is unlikely to be explained by demographic features. The signal also remains detectable when the model is trained exclusively on a single racial group. Patch-based occlusion reveals that the signal is diffuse rather than localized, embedded in the upper and mid-thoracic regions. This suggests that deep networks may be internalizing subtle traces of clinical environments, equipment differences, or care pathways, learning the socioeconomic signal itself. These findings challenge the assumption that medical images are neutral biological data. By uncovering how models perceive and exploit these hidden social signatures, this work reframes fairness in medical AI: the goal is no longer only to balance datasets or adjust thresholds, but to interrogate and disentangle the social fingerprints embedded in clinical data itself.}
}



@InProceedings{pmlr-v315-peng26a,
  title = 	 {CrossPan: A Comprehensive Benchmark for Cross-Sequence Pancreas MRI Segmentation and Generalization},
  author =       {Peng, Linkai and Sun, Cuiling and Zhang, Zheyuan and Dou, Wanying and Aktas, Halil Ertugrul and Bejar, Andrea M and Keles, Elif and Gonda, Tamas and Wallace, Michael B and Zhou, Zongwei and Durak, Gorkem and Keswani, Rajesh N and Bagci, Ulas},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {4182--4216},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/peng26a/peng26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/peng26a.html},
  abstract = 	 {Automatic pancreas segmentation is fundamental to abdominal MRI analysis, yet deep learning models trained on one MRI sequence often fail catastrophically when applied to another—a challenge that has received little systematic investigation. We introduce CrossPan, a multi-institutional benchmark comprising 1,386 3D scans across three routinely acquired sequences (T1-weighted, T2-weighted, and Out-of-Phase) from eight centers. Our experiments reveal three key findings. First, cross-sequence domain shifts are far more severe than cross-center variability: models achieving Dice scores above 0.85 in-domain collapse to near-zero ($<$0.02) when transferred across sequences. Second, state-of-the-art domain generalization methods provide negligible benefit under these physics-driven contrast inversions, whereas foundation models like MedSAM2 maintain moderate zero-shot performance through contrast-invariant shape priors. Third, semi-supervised learning offers gains only under stable intensity distributions and becomes unstable on sequences with high intra-organ variability. These results establish cross-sequence generalization—not model architecture or center diversity—as the primary barrier to clinically deployable pancreas MRI segmentation.}
}



@InProceedings{pmlr-v315-carannante26a,
  title = 	 {Testing the Trust: Verification and Validation of Bayesian Segmentation under Uncertainty},
  author =       {Carannante, Giuseppina and Bouaynaya, Nidhal C. and Dera, Dimah and Fathallah-Shaykh, Hassan M. and Rasool, Ghulam},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {4217--4239},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/carannante26a/carannante26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/carannante26a.html},
  abstract = 	 {Deep learning has achieved state-of-the-art performance in medical image segmentation, yet safe clinical deployment requires rigorous verification and validation of model robustness, reliability, and uncertainty behavior. Bayesian segmentation methods are often viewed as more trustworthy because they provide uncertainty estimates that can support human decision-making, flag unreliable predictions, and mitigate risks in downstream clinical workflows. However, most prior studies evaluate these models primarily on clean test data, with limited assessment of robustness to perturbations, and without examining whether the predicted uncertainty meaningfully correlates with segmentation quality. In this work, we conduct a comprehensive and systematic evaluation of state-of-the-art deterministic and Bayesian segmentation models across multiple datasets, corruption types, and performance metrics. Beyond accuracy-based metrics such as DSC and HD95, we analyze over- and under-segmentation trends, predictive variance, and the relationship between uncertainty and segmentation correctness. Our results show that while all models behave similarly on clean or mildly corrupted data, performance diverges significantly as perturbations increase. Models that learn and propagate uncertainty during training tend to exhibit improved robustness under severe perturbations and uncertainty estimates that better correlate with segmentation errors, suggesting potential advantages for safety-critical deployment.}
}



@InProceedings{pmlr-v315-kondrateva26a,
  title = 	 {Benchmarking the Reproducibility of Brain Tissue Segmentation Across MRI Scanners},
  author =       {Kondrateva, Ekaterina and Mohamed, Abdalla Z and Barg, Sandzhi and Kofler, Florian},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {4240--4271},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/kondrateva26a/kondrateva26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/kondrateva26a.html},
  abstract = 	 {Accurate and reproducible brain morphometry from structural magnetic resonance imaging is critical for monitoring neuroanatomical changes across time and imaging domains. Although deep learning has accelerated segmentation workflows, scanner-induced variability and limited reproducibility remain major obstacles, particularly in longitudinal and multi-site studies. In this study, we benchmark two state-of-the-art segmentation pipelines, FastSurfer and SynthSeg, integrated into FreeSurfer, one of the most widely adopted neuroimaging tools. Using two complementary datasets—a 17-year single-subject longitudinal cohort and a nine-site test–retest cohort—we quantify between-scan segmentation variability with region-wise overlap and distance measures, including the Dice similarity coefficient, surface Dice, the 95th percentile of the Hausdorff distance, and the mean absolute percentage error in regional volumes. Our results reveal up to 7–8% variation in the volumes of small subcortical structures such as the amygdala and ventral diencephalon, even under controlled test–retest conditions. This level of noise raises a critical question: can we reliably detect subtle longitudinal changes of 5–10% in small brain regions with volumes below 2 milliliters, given the magnitude of scanner- and site-induced morphometric variability? We further analyze how registration choices and interpolation modes contribute additional, although smaller, biases, and we show that surface-based quality filtering can remove outlier segmentations while preserving most scans and maintaining morphometric stability. This work provides a reproducible benchmark of modern FreeSurfer-based segmentation pipelines and highlights the need for harmonization and quality-control strategies to enable robust morphometry in real-world neuroimaging studies.}
}



@InProceedings{pmlr-v315-mushunuri26a,
  title = 	 {Deep Learning Based Emboli Detection Using Ultrasound Doppler Imaging},
  author =       {Mushunuri, Raghava Vinaykanth and Dahl, Cecilie Le Duc and Iversen, Elisabeth Krogstad and Vik, Sigrid Dannheim and Leth-Olsen, Martin and Torp, Hans and Nyrnes, Siri Ann and Kiss, Gabriel},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {4272--4287},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/mushunuri26a/mushunuri26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/mushunuri26a.html},
  abstract = 	 {Accurate detection of embolic signals in the bloodstream is crucial for early diagnosis and prevention of cerebrovascular complications, and this work develops and evaluates an artificial intelligence–based system for automatic emboli detection in power Doppler imaging from NeoDoppler, aiming for robust and real-time performance. The study uses a four-stage experimental pipeline built on convolutional neural networks with transfer learning: an initial baseline model (Stage 1), an assessment of spatial generalisation (Stage 2), and a hybrid two-step strategy (Stage 3) that combines conventional High-Intensity Transient Signal (HITS) pre-detection with CNN-based classification, followed by a simplified preprocessing strategy in Stage 4, where single-channel images are replicated into three channels to match pre-trained CNN architectures; all models are trained with 5-fold cross-validation on 523 recordings from 25 patients and evaluated on unseen pilot recordings from the same cohort and additional abdominal surgery data. Across stages, performance improves progressively, with the hybrid two-step framework using the three-channel replication yielding strong results, achieving 96% sensitivity and 98% specificity on the pilot recording and 94% sensitivity and 71% specificity on the abdominal surgery recordings. We estimated 95% confidence intervals (CIs) using Wilson’s score for abdominal surgery recordings, with a CI of 0.730-0.99, demonstrating that the proposed approach is an efficient and interpretable solution for ultrasound-based emboli monitoring.}
}



@InProceedings{pmlr-v315-kenia26a,
  title = 	 {ReX-MLE: The Autonomous Agent Benchmark for Medical Imaging Challenges},
  author =       {Kenia, Roshan and Zhang, Xiaoman and Rajpurkar, Pranav},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {4288--4315},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/kenia26a/kenia26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/kenia26a.html},
  abstract = 	 {Autonomous coding agents built on large language models (LLMs) can now solve many general software and machine learning tasks, but they remain ineffective on complex, domain-specific scientific problems. Medical imaging is a particularly demanding domain, requiring long training cycles, high-dimensional data handling, and specialized preprocessing and validation pipelines, capabilities not fully measured in existing agent benchmarks. To address this gap, we introduce ReX-MLE, a benchmark of 20 challenges derived from high-impact medical imaging competitions spanning diverse modalities and task types. Unlike prior ML-agent benchmarks, ReX-MLE evaluates full end-to-end workflows, requiring agents to independently manage data preprocessing, model training, and submission under realistic compute and time constraints. Evaluating state-of-the-art agents (AIDE, ML-Master, R\&D-Agent) with different LLM backends (GPT-5, Gemini, Claude), we observe a severe performance gap: most submissions rank in the 0th percentile compared to human experts. Failures stem from domain-knowledge and engineering limitations. ReX-MLE exposes these bottlenecks and provides a foundation for developing domain-aware autonomous AI systems.}
}



@InProceedings{pmlr-v315-wang26h,
  title = 	 {Multicenter Morphometric Analysis of Stratum Corneum Nanotexture for Skin Barrier Assessment},
  author =       {Wang, Jen-Hung and Chu, Chia-Yu and Colombelli, Felipe and Du, Ching-Wen and Christensen, Maria Oberl\"ander and Pereda, Jorge and Jakasa, Ivone and Kezic, Sanja and Thyssen, Jacob P. and Hwu, Edwin En-Te and Miranda, Gisele},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {4316--4341},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/wang26h/wang26h.pdf},
  url = 	 {https://proceedings.mlr.press/v315/wang26h.html},
  abstract = 	 {Stratum corneum nanotexture (SCN) has emerged as a promising non-invasive biomarker for quantifying skin barrier impairment and the severity of inflammatory skin diseases such as atopic dermatitis (AD). In this multicenter study, we analyzed stratum corneum tape-strip samples from 90 patients with AD and 30 healthy controls recruited in Taiwan and Denmark, yielding a heterogeneous dataset of more than 2,000 SCN images. Participants were evenly stratified into four AD severity groups defined by the Eczema Area and Severity Index (EASI), enabling robust evaluation of SCN-derived metrics across the full spectrum of disease severity. Previous studies have primarily relied on count-based measures to quantify the density of circular nano-size objects (CNOs) in SCN images from single-center cohorts, without leveraging instance-level segmentation or comprehensive morphometric profiling. In this study, we propose and validate a segmentation-based SCN analysis pipeline that integrates YOLOv12 with Segment Anything Model 3 (SAM3) for accurate CNO delineation in a multicenter setting. This framework enables the extraction of detailed morphometric descriptors and facilitates systematic evaluation of SCN-derived biomarkers for quantitative skin barrier assessment in AD.}
}



@InProceedings{pmlr-v315-gunawardhana26a,
  title = 	 {A Comprehensive Benchmarking and Systematic Analysis of Deep Learning Models for Sonomammogram Segmentation},
  author =       {Gunawardhana, Malitha and Zolek, Norbert},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {4342--4355},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/gunawardhana26a/gunawardhana26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/gunawardhana26a.html},
  abstract = 	 {Accurate segmentation of breast lesions in sonomammograms supports computer-assisted diagnosis and early breast cancer detection. Existing public ultrasound datasets contain duplicates, mislabeled cases, and non-breast images, which leads to unreliable model evaluation. To address this, we construct a curated multi-centre dataset of 3,494 images with expert-verified annotations and patient-level splits. Using this dataset, we define a unified benchmarking protocol and evaluate eleven representative architectures, including nnU-Net variants, SegResNet, SwinUNETR, U-Mamba, and SAMed. All models are trained and assessed under identical preprocessing, training, and evaluation settings. Performance is measured with Dice, Sensitivity, Specificity, Accuracy, and Hausdorff Distance metrics. We also analyse how loss function choice and training data volume influence performance. SAMed p512 obtains the best Dice score at 0.860 $\pm$ 0.141 and the lowest Hausdorff Distance at 3.896 $\pm$ 5.472. The benchmark provides a reproducible reference for breast ultrasound segmentation and clarifies how architecture design and data-related factors shape performance in this setting.}
}



@InProceedings{pmlr-v315-chakrabarty26a,
  title = 	 {Comparing SAM 2 and SAM 3 for Zero-Shot Segmentation of 3D Medical Data},
  author =       {Chakrabarty, Satrajit and Soni, Ravi},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {4356--4383},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/chakrabarty26a/chakrabarty26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/chakrabarty26a.html},
  abstract = 	 {Foundation models, such as the Segment Anything Model (SAM), have heightened interest in promptable zero-shot segmentation. Although these models perform strongly on natural images, their behavior on medical data remains insufficiently characterized. While SAM 2 has been widely adopted for annotation in 3D medical workflows, the recently released SAM 3 introduces a new architecture that may change how visual prompts are interpreted and propagated. Therefore, to assess whether SAM 3 can serve as an out-of-the-box replacement for SAM 2 for zero-shot segmentation of 3D medical data, we present the first controlled comparison of both models by evaluating SAM 3 in its Promptable Visual Segmentation (PVS) mode using a variety of prompting strategies. We benchmark on 16 public datasets (CT, MRI, Ultrasound, endoscopy) covering 54 anatomical structures, pathologies, and surgical instruments. We further quantify three failure modes: prompt-frame over-segmentation, over-propagation after object disappearance, and temporal retention of well-initialized predictions. Our results show that SAM 3 is consistently stronger under click prompting across modalities, with fewer prompt-frame over-segmentation failures and slower prediction retention decay compared to SAM 2. Under bounding-box and mask prompts, performance gaps narrow for a few CT/MR structures and the models trade off termination behavior, while SAM 3 remains stronger on ultrasound and endoscopy sequences. The overall results position SAM 3 as the superior default choice for most medical segmentation tasks, while clarifying when SAM 2 remains a preferable propagator.}
}



@InProceedings{pmlr-v315-al-belmpeisi26a,
  title = 	 {Deep Learning for Liver Disease Stratification: Findings from UKBB MRI},
  author =       {Al-Belmpeisi, Rami and Sundgaard, Josefine Vilsb{\o}ll and Larsen, Peter Hj{\o}rringgaard and Dahl, Anders Bjorholm},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {4384--4400},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/al-belmpeisi26a/al-belmpeisi26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/al-belmpeisi26a.html},
  abstract = 	 {Metabolic dysfunction-associated steatotic liver disease (MASLD) and its progressive form, metabolic dysfunction-associated steatohepatitis (MASH), have become more prevalent, spurring interest in using magnetic resonance imaging (MRI) sequences for diagnosis. In this study, we propose a method that uses deep learning to diagnose MASLD and MASH with significant fibrosis from single-slice (2D) and volumetric (3D) MRI sequences that originate from the UK Biobank. In this paper, we focus on transparent decision-making. Our study shows that imposing anatomically informed constraints by using a liver segmentation mask on the network’s input has minimal impact on diagnostic performance. Still, it redirects attention to clinically relevant liver regions, preventing shortcut learning from extrahepatic features, such as subcutaneous fat. These constraints shift the focus of the model toward proton density fat fraction (PDFF) maps for healthy liver assessment, $T_1$ maps for MASLD diagnosis, and both sequences to identify MASH with significant fibrosis. Our top-performing models achieve AUCs of 0.89/0.96/0.79 for the diagnosis of the healthy/MASLD/MASH groups with significant fibrosis, respectively. Despite label noise and limited sequence specificity, which primarily hinder predictive performance in cases of MASH with significant fibrosis, the identified indicators are frequently located in liver regions consistent with prior understanding of disease progression. In conclusion, we find that 2D MRI sequences are sufficient for diagnosing MASLD/MASH with significant fibrosis, as performance decreases and computation time increases when using 3D volumes.}
}



@InProceedings{pmlr-v315-maruccio26a,
  title = 	 {Impact of uncertainty maps on manual editing of rectal cancer segmentation in radiotherapy},
  author =       {Maruccio, Federica Carmen and Sim\~oes, Rita and Cnossen, Fokie and Jamtheim Gustafsson, Christian and Conijn, Sanne and Couwenberg, Alice and Gerrets-van Noord, Suzan and de Jong, Inge and van Pelt, Vivian and Wiersema, Lisa and van Aalst, Jo\"elle and Sonke, Jan-Jakob and Brouwer, Charlotte L. and Janssen, Tomas},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {4401--4447},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/maruccio26a/maruccio26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/maruccio26a.html},
  abstract = 	 {Uncertainty maps provide a quantitative and visual representation of the estimated confidence of Deep Learning (DL) models in contouring predictions and have been proposed to improve clinicians’ efficiency during manual review. However, uncertainty maps are not currently integrated into clinical workflows, and evidence on their actual benefit in clinical decision-making remains limited. This study investigates the impact of simulated uncertainty maps on clinicians’ behaviour during manual editing of high-quality clinical target volume (CTV) contours in rectal cancer radiotherapy. An inter-observer variability dataset of ten patients was used to simulate meaningful DL uncertainty maps and contours. Six clinicians edited the contours across two editing sessions, with and without uncertainty maps. For each session, editing time, editing amount, questionnaire responses, and interview feedback were collected to assess the impact both quantitatively and qualitatively. Editing time and editing amount were comparable with and without uncertainty maps, while both measures decreased significantly in the second editing session, indicating a learning effect from task repetition. Qualitative feedback showed that clinicians’ decisions were shaped more by human factors, such as workload, mood, memory and anchoring biases, than by the uncertainty maps. Moreover, the study revealed low clinician trust in the uncertainty maps, which were used primarily for confirmation rather than decision-making. The findings suggest that the value of uncertainty maps may be limited for high-quality contours and highlight the need to investigate their relevance for different use cases.}
}



@InProceedings{pmlr-v315-heyer26a,
  title = 	 {Evaluation of 3D Ultrasound Reconstruction and 2D/3D Segmentation for Neonatal Hip Dysplasia Screening},
  author =       {Heyer, Wiebke and Ott, Katharina and Weihsbach, Christian and Sorbi, Reza and Lange, Lisa and Lichtenstein, J\"urgen and Hell, Anna and Lippross, Sebastian and Hansen, Lasse and Heinrich, Mattias P.},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {4448--4478},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/heyer26a/heyer26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/heyer26a.html},
  abstract = 	 {Early detection of developmental dysplasia of the hip relies heavily on the correct acquisition and interpretation of ultrasound images. Yet, conventional single-plane imaging provides only a limited view of the neonatal hip, is operator-dependent and sensitive to probe orientation. In this study, we present a clinically oriented validation of a dual-sweep 3D ultrasound approach aimed at improving anatomical coverage and simplifying the diagnostic process. Our dataset comprises 50 optically tracked acquisitions and 150 untracked freehand sweeps from newborns, enabling the reconstruction of volumetric representations of the hip from standard handheld 2D ultrasound. We evaluate 2D and 3D nnU-Net–based segmentation models to quantify how volumetric context influences the delineation of key joint structures. Results demonstrate that the combination of 2D slice-based and 3D volumetric segmentation yields the most robust performance, particularly in cases with anatomical variability or suboptimal sweep direction. The study also highlights remaining challenges, including motion artefacts and inconsistent sweep trajectories, that affect reconstruction quality.}
}



@InProceedings{pmlr-v315-mhanna26a,
  title = 	 {A comprehensive benchmark of graph neural networks, graph kernels, and classical machine learning approaches on rs-fMRI brain graphs},
  author =       {Mhanna, Razan and Achard, Sophie and Petersen, Alexander and Richiardi, Jonas},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {4479--4495},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/mhanna26a/mhanna26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/mhanna26a.html},
  abstract = 	 {Resting-state functional MRI (rs-fMRI) provides a powerful lens through which large-scale brain organization can be examined by modeling functional connectivity as a graph. These functional brain graphs now form the basis of machine-learning applications in neuroscience, ranging from relatively straightforward classification problems to more challenging behavioral and cognitive prediction tasks. While graph neural networks (GNNs) have gained increasing attention in neuroimaging, the absence of a unified, reproducible benchmark comparing GNNs with classical machine-learning models and graph kernel methods, across heterogeneous datasets and tasks, has made it difficult to assess their relative strengths. In this work, we introduce a comprehensive benchmarking framework spanning four heterogeneous cohorts ($N = 1513$) and multiple classification tasks, including clinical diagnosis and phenotypic prediction. We systematically evaluate classical models, graph kernels, and representative GNN architectures under a rigorous repeated nested cross-validation design and assess pairwise differences using the corrected repeated k-fold test with false-discovery-rate control. Our results show that, for this class of relatively small graphs with fixed vertex ordering, well-tuned classical ML approaches and graph kernels are competitive with GNNs, while requiring substantially fewer computational resources. For instance, the Shortest-Path graph kernel achieves 0.98 accuracy on the COMA dataset, logistic regression reaches 0.81 accuracy and 0.63 MCC on HCP sex prediction, and all model families cluster closely on multi-site datasets such as ABIDE and ADHD, where no statistically significant differences emerge. All code, seeds, cross-validation folds, fold-specific hyperparameters, full prediction logs and computational-cost measurements are publicly released to ensure full transparency and reproducibility. This benchmark provides practical guidance for model selection in rs-fMRI connectome analysis.}
}



@InProceedings{pmlr-v315-liu26e,
  title = 	 {Can Vision Language Models Track a Heartbeat?  A Benchmark on Frame-Level Echocardiogram Understanding},
  author =       {Liu, Dingming and Jabareen, Nabil and Lukassen, Soeren},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {4496--4517},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/liu26e/liu26e.pdf},
  url = 	 {https://proceedings.mlr.press/v315/liu26e.html},
  abstract = 	 {Echocardiogram videos are among the most common and clinically vital imaging modalities in cardiovascular medicine. They capture dynamic cardiac motion, and their accurate functional assessment requires frame-level temporal precision. Ejection fraction (EF) is an essential metric for assessing cardiac function and is computed from the left-ventricular volumes at end-diastole (EDV) and end-systole (ESV), making its estimation inherently dependent on accurate frame-wise temporal reasoning. General Vision Language Models (VLMs) have recently shown strong performance in general video understanding. However, whether they can reliably reason over the fine-grained temporal dynamics required for echocardiographic interpretation remains unclear. We benchmarked six state-of-the-art open-source VLMs, Gemma-3n, LLaVA-Interleave, LLaVA-NeXT-Video 7B/34B, and Qwen3-VL 8B/32B, on the clinically motivated task of frame-level EDV/ESV localization in apical four-chamber echocardiograms. All models performed poorly on this localization task, with errors far beyond clinically acceptable tolerances, and in some cases indistinguishable from random Monte Carlo baselines. To further test whether explicit structural guidance could compensate for limited temporal reasoning, we additionally provided left-ventricular segmentation overlays as auxiliary visual input for both tasks. However, even with segmentation cues, performance gains remained negligible in these tasks. Prompting the model to focus on masked areas only, omitting any medical context, did not lead to marked improvements. To reduce the complexity to a pure size comparison, we further evaluated a simplified two-frame binary classification task in which each model must distinguish end-diastole (ED) from end-systole (ES). Despite this simplification, performance remained low for most models on the original videos; only Qwen3-VL-32B reached an accuracy of 0.711. Providing segmentation overlays and omitting medical background knowledge helped only Qwen3-VL, with both model sizes reaching accuracy above 0.9, while the other models remained at random-level performance. This work presents the first systematic evaluation of general-purpose VLMs on echocardiogram video analysis across progressively simplified temporal reasoning tasks. Our results reveal a fundamental limitation of current VLMs in frame-level cardiac ultrasound interpretation. This work highlights the importance of medical benchmarks for VLMs and the need for domain-specific temporal modeling in future medical VLMs. To facilitate benchmarking of VLMs on echocardiogram video analysis, we make the benchmark and all associated code publicly available here.}
}



@InProceedings{pmlr-v315-navet26a,
  title = 	 {On the Stability and Robustness of Vision Transformers for Neurodegenerative Disease Classification},
  author =       {Navet, Eloi and Giraud, R{\'e}mi and Mansencal, Boris and Coup{\'e}, Pierrick},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {4518--4554},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/navet26a/navet26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/navet26a.html},
  abstract = 	 {Vision Transformers (ViTs) have recently been explored for structural MRI classification, motivated by their ability to capture non-local image structure. However, in limited and heterogeneous clinical cohorts, their weak inductive biases and sensitivity to training conditions often lead to high-variance behaviour. While binary settings such as cognitively normal vs. dementia are widely reported and typically exhibit moderate variability, we show that this stability does not extend to differential diagnosis. When increasing task complexity (e.g., controls vs. Alzheimer’s Disease vs. Frontotemporal Dementia), performance becomes sensitive to class imbalance and phenotype overlap, with greater variability driven by fewer samples per class, noisier labels, and increased inter-site heterogeneity. In this study, we investigate a stabilization protocol combining data augmentation, architectural constraints, and optimization strategies on multi-site MRI datasets. We assess how model variance evolves with task complexity using patient-level paired bootstrapping, calibration analysis, paired significance tests, and estimates of the probability of false outperformance to obtain uncertainty-aware comparisons across models. Our results highlight conditions under which Transformer-based classifiers can be consistently trained with limited neuroimaging data and illustrate that several performance gains disappear once stochastic variability is reported. These results emphasize that reliable differential diagnosis with ViTs requires both robust stabilization protocols to mitigate optimization noise and standardized uncertainty quantification beyond simple point-estimates.}
}



@InProceedings{pmlr-v315-abbas26a,
  title = 	 {UMamba-ProSSL: Self-Supervised Large-Scale Pretraining with Multi-Task UMamba Advances Prostate Cancer Detection in Biparametric MRI},
  author =       {Abbas, Syed Farhan and Larsen, Michael S. and Str{\o}msv{\aa}g, Arild and Bathen, Tone F. and Lindseth, Frank and Kiss, Gabriel and Elschot, Mattijs},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {4555--4578},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/abbas26a/abbas26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/abbas26a.html},
  abstract = 	 {Accurate prostate cancer (PCa) diagnosis is crucial, as it remains one of the leading causes of mortality among men. Although prostate magnetic resonance imaging (MRI) has improved the diagnostic workflow, radiologists still face challenges due to inter-observer variability and limited specificity, leading to both over- and under-diagnosis. Deep learning methods have the potential to support radiologists, but their performance typically depends on large, high-quality labeled datasets that are often scarce and expensive to curate. In contrast, large volumes of unlabeled prostate MRI scans are routinely generated in clinical practice, making self-supervised learning (SSL) a compelling approach to exploit this abundant, untapped resource. However, SSL performance depends strongly on backbone architectures and effective pretext tasks. Moreover, the lack of large-scale standardized benchmarking further limits progress. In this study, we employ a state-of-the-art UMamba architecture for prostate cancer detection and investigate several SSL strategies using a large in-house unlabeled prostate MRI dataset (N=2,431). Among the different pretraining methods, UMamba pretrained with masked autoencoders (MAE) achieved the best downstream performance, with an aggregated mean score of 0.780 (AUROC: 0.905, AP: 0.655) on the large-scale PI-CAI hidden testing set (N=1,000). This performance ranked first on the PI-CAI benchmark leaderboard at the time of evaluation. To further evaluate generalizability, we conducted an evaluation on the out-of-distribution Prostate158 (N=158) dataset, where MAE-pretrained UMamba achieved the best generalization performance, indicating robustness across different clinical centers and imaging protocols. These findings highlight the strong potential of SSL, particularly MAE combined with UMamba, for improving PCa detection accuracy and potentially reducing unnecessary biopsies.}
}



@InProceedings{pmlr-v315-mcconnell26a,
  title = 	 {Scalable Detection of Undiagnosed ILD in Population Screening: A Multi-Cohort Study using 3D Foundation Models},
  author =       {McConnell, Niccol\`o and Azimbagirad, Mehran and Cheng, Daryl O. and Yamada, Daisuke and Egashira, Ryoko and Chapman, Robert and McCabe, John and Wang, Shanshan and Lynch, David and Kinney, Greg and Vasudev, Pardeep and Taylor, Paul and Alexander, Daniel C. and Janes, Sam M. and Jacob, Joseph},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {4579--4599},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/mcconnell26a/mcconnell26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/mcconnell26a.html},
  abstract = 	 {Undiagnosed interstitial lung disease (UILD), an early form of lung fibrosis, is increasingly detected in population-based low-dose computed tomography (LDCT) screening but remains systematically under-reported due to its subtle appearance. We developed and validated a foundation-model-augmented deep learning system for UILD detection across two of the largest thoracic CT cohorts worldwide: SUMMIT, the UK’s largest LDCT screening study ($>$11{,}000 scans), and COPDGene, a multi-centre US cohort spanning 21 scanners and $>$8{,}800 scans. We propose ViT-3D-TE, a multi-token 3D Vision Transformer designed to preserve both high-frequency focal texture and diffuse parenchymal change through CLS, MAX, and AVG token fusion. The model was initialised with TANGERINE, an open-source 3D masked autoencoder pretrained on 98{,}000 full-volume LDCT scans, providing volumetric priors essential for stable optimisation. ViT-3D-TE was trained solely on SUMMIT and evaluated on COPDGene without domain adaptation, and achieved strong performance (AUROC 0.9805, AUPRC 0.7699 internal; AUROC 0.9705, AUPRC 0.6170 external), representing 17$\times$ and 25$\times$ improvements over random baselines at clinically realistic cohort prevalences (4.6% and 2.5%). We further introduce ConvNeXt-2.5-MIL, a slice-based 2.5D alternative that performs competitively without relying on 3D foundation model pretraining. Together, these results provide, to our knowledge, the largest real-world validation to date of deep learning for UILD detection and demonstrate that foundation-model-enhanced 3D Transformers offer a practical and scalable pathway for integrating UILD detection into national LDCT screening workflows.}
}



@InProceedings{pmlr-v315-wohlrapp26a,
  title = 	 {Evaluating the Impact of Medical Image Reconstruction on Downstream AI Fairness and Performance},
  author =       {Wohlrapp, Matteo and Bubeck, Niklas and Rueckert, Daniel and Lotter, William},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {4600--4638},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/wohlrapp26a/wohlrapp26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/wohlrapp26a.html},
  abstract = 	 {AI-based image reconstruction models are increasingly deployed in clinical workflows to improve image quality from noisy data, such as low-dose X-rays or accelerated MRI scans. However, these models are typically evaluated using pixel-level metrics like PSNR, leaving their impact on downstream diagnostic performance and fairness unclear. We introduce a scalable evaluation framework that applies reconstruction and diagnostic AI models in tandem, which we apply to two tasks (classification, segmentation), three reconstruction approaches (U-Net, GAN, diffusion), and two data types (X-ray, MRI) to assess the potential downstream implications of reconstruction. We find that conventional reconstruction metrics poorly track task performance, where diagnostic accuracy remains largely stable even as reconstruction PSNR declines with increasing image noise. Fairness metrics exhibit greater variability, with reconstruction sometimes amplifying demographic biases, particularly regarding patient sex. However, the overall magnitude of this additional bias is modest compared to the inherent biases already present in diagnostic models. To explore potential bias mitigation, we adapt two strategies from classification literature to the reconstruction setting, but observe limited efficacy. Overall, our findings emphasize the importance of holistic performance and fairness assessments throughout the entire medical imaging workflow, especially as generative reconstruction models are increasingly deployed.}
}



@InProceedings{pmlr-v315-hoq26a,
  title = 	 {Virtual-Eyes: Quantitative Validation of a Lung CT Quality-Control Pipeline for Foundation-Model Cancer Risk Prediction},
  author =       {Hoq, Md. Enamul and Larson-Prior, Linda and Prior, Fred},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {4639--4663},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/hoq26a/hoq26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/hoq26a.html},
  abstract = 	 {Robust preprocessing is rarely quantified in deep-learning pipelines for low-dose CT (LDCT) lung cancer screening. We develop and validate Virtual-Eyes, a clinically motivated, 16-bit CT quality-control pipeline for NLST, and measure its differential impact on generalist foundation models versus specialist models. Virtual-Eyes enforces strict 512 $\times$ 512 resolution, rejects short or non-diagnostic series, and extracts a contiguous lung block using Hounsfield-unit filtering and bilateral lung-coverage scoring while preserving the original 16-bit DICOM grid. Using 765 NLST patients (182 cancer, 583 non-cancer), we evaluate RAD-DINO, Merlin, Sybil, and ResNet-18 under a leakage-free protocol. For RAD-DINO, preprocessing improves slice-level AUC from 0.576 to 0.610 and patient-level AUC from 0.646 to 0.683 (mean pooling) and 0.619 to 0.735 (max pooling), with improved calibration (Brier score 0.188 $\rightarrow$ 0.112). In contrast, Sybil and ResNet-18 degrade under Virtual-Eyes, revealing reliance on contextual or shortcut features, while Merlin shows limited transferability. Sensitivity analysis and uncertainty estimation confirm the robustness and stability of these findings.}
}



@InProceedings{pmlr-v315-duarte26a,
  title = 	 {Multi-site Benchmarking of Deep Learning Models for Intraparenchymal Hemorrhage Segmentation on NCCT},
  author =       {Duarte, Kau{\^e} T N and Sidhu, Abhijot S and Barros, Murilo C and Aslan, Taha and Zhang, Donghao and Zhang, Jianhai and Bhatt, Devansh and Karmur, Brij and AlShamrani, Mohamed and Qiu, Wu and Ganesh, Aravind and K Menon, Bijoy},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {4664--4682},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/duarte26a/duarte26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/duarte26a.html},
  abstract = 	 {Intraparenchymal hemorrhage (IPH) is a critical and often fatal subtype of hemorrhagic stroke, requiring rapid and accurate diagnosis on non-contrast computed tomography (NCCT) scans for effective treatment. While deep learning (DL) models, particularly convolutional neural networks (CNNs), offer potential for automating IPH segmentation, their real-world clinical utility is often limited by the lack of explicit data integration across diverse hospital sites with varying imaging protocols. This study conducted a multi-site benchmarking of five prominent CNN architectures: baseline U-Net, Attention U-Net, Feature Pyramid Network (FPN), Swin U-Net, and Trans U-Net, for IPH segmentation on a heterogeneous dataset from 17 clinical sites. Models were rigorously evaluated using F-measure (a.k.a., Dice), Intersection over Union (IoU), and 95% Hausdorff Distance ($d_{H95}$). The advanced CNN variants (Attention U-Net, FPN, Trans U-Net) significantly outperformed the baseline U-Net in F-measure and IoU (e.g., FPN F-measure: $0.868$ vs. U-Net: $0.819$, $p<0.001$), with no significant difference among them. For boundary error, FPN reduced $d_{H95}$ compared to the baseline, whereas Trans U-Net showed improvement, though it was not significant. These models exhibited robust cross-site generalization across hemorrhage volumes, with minimal site-specific effects on performance. This study demonstrates that advanced CNN variants can be adopted for IPH segmentation to standardize and potentially accelerate IPH diagnosis.}
}



@InProceedings{pmlr-v315-issah26a,
  title = 	 {Detection versus Instance Segmentation for Multi-Species Malaria Diagnosis: A Head-to-Head Comparison and Multi-Dataset Validation of YOLOv12 Architectures with Small Object Optimization},
  author =       {Issah, Ahmed Tahiru and Seidu, Idaya and Mukamakuza, Carine},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {4683--4702},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/issah26a/issah26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/issah26a.html},
  abstract = 	 {Automated malaria parasite detection using deep learning holds promise for addressing diagnostic gaps in resource-limited settings, yet most studies rely on single-dataset evaluations that fail to capture real-world variability. In this work, we rigorously validate YOLOv12-based architectures for malaria detection across diverse geographic and institutional contexts. We introduce a dual-head architecture combining instance segmentation with a high-resolution P2 detection head to target tiny ring-stage parasites. Our evaluation on a diverse Rwandan thick-smear dataset (2,739 images) and two external datasets from Ghana (Lacuna) and Nigeria (FASTMAL) reveals critical insights into model robustness. While the proposed YOLOv12-Seg-N-P2 model achieves state-of-the-art internal performance (mAP@50 $0.888$) and significantly improves detection of challenging P. vivax (+10.9%) and P. falciparum ring forms, external validation exposes severe domain shift, with performance dropping by $>$80% on unseen datasets. We further demonstrate that while P2 heads enhance morphological precision on source data, they reduce zero-shot generalization, likely by overfitting to dataset-specific acquisition characteristics. We additionally evaluate white blood cell (WBC)-anchored stain normalization and pixel-scale rescaling as inference-time domain adaptation strategies. While WBC detection improves substantially (up to +45% on Lacuna), P. falciparum detection remains critically low across both external datasets despite partial recovery on FASTMAL, confirming that preprocessing-based adaptation alone is insufficient for reliable cross-site parasite detection.}
}



@InProceedings{pmlr-v315-hays26a,
  title = 	 {Harmonizing MR Images Across 100+ Scanners: Multi-site Validation with Traveling Subjects and Real-world Protocols},
  author =       {Hays, Savannah P. and Zuo, Lianrui and Chaudhary, Muhammad Faizyab Ali and Bartz, Kathleen M. and Remedios, Samuel W. and Zhang, Jinwei and Zhuo, Jiachen and Bilgel, Murat and Saidha, Shiv and Mowry, Ellen M. and Newsome, Scott D. and Prince, Jerry L. and Dewey, Blake E. and Carass, Aaron},
  booktitle = 	 {Proceedings of The 9th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {4703--4721},
  year = 	 {2026},
  editor = 	 {Huo, Yuankai and Gao, Mingchen and Kuo, Chang-Fu and Jin, Yueming and Deng, Ruining},
  volume = 	 {315},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {08--10 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v315/main/assets/hays26a/hays26a.pdf},
  url = 	 {https://proceedings.mlr.press/v315/hays26a.html},
  abstract = 	 {Reliable harmonization of heterogeneous magnetic resonance (MR) image datasets, especially those acquired in pragmatic clinical trials, is critical to advance multi-center neuroimaging studies and translational machine learning in healthcare. We present an enhanced and rigorously validated version of the HACA3 harmonization algorithm, which we refer to as HACA3$^+$, incorporating key methodological enhancements: (1) an improved artifact encoder to better isolate and mitigate image artifacts, (2) background and foreground-sensitive attention mechanisms to increase harmonization specificity, and (3) extensive training using data spanning 100+ scanners from 64 independent sites, providing a broader diversity of scanners than other harmonization methods. Our study focuses on four commonly acquired MR image contrasts (T1-weighted, T2-weighted, proton density, and fluid-attenuated inversion recovery), reflecting realistic clinical protocols. We perform inter-site harmonization experiments using traveling subjects to assess the generalization and robustness of the harmonization model. We compare the results of the publicly available version of HACA3 and our implementation, HACA3$^+$. Downstream relevance is further established through whole brain segmentation and image imputation. Finally, we justify each enhancement through an ablation experiment.}
}



