


% Volume-level record for PMLR v301 (8th Medical Imaging with Deep Learning
% conference). The year matches the InProceedings entries of this volume.
@Proceedings{MIDL2025,
  title     = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  publisher = {PMLR},
  series    = {Proceedings of Machine Learning Research},
  volume    = {301},
  year      = {2026}
}



% Front matter of PMLR v301: editors' preface, roman-numbered pages i--xvi.
@InProceedings{pmlr-v301-tasdizen26a,
  author    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  title     = {Preface},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  series    = {Proceedings of Machine Learning Research},
  volume    = {301},
  publisher = {PMLR},
  year      = {2026},
  month     = {09--11 Jul},
  pages     = {i--xvi},
  url       = {https://proceedings.mlr.press/v301/tasdizen26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/tasdizen26a/tasdizen26a.pdf},
  abstract  = {Preface to the MIDL 2025 proceedings},
}



% PMLR v301, pp. 1--14: multimodal (OCT + clinical) post-treatment visual
% acuity prediction. Percent signs in the abstract are escaped for LaTeX.
@InProceedings{pmlr-v301-anderson26a,
  title     = {Enhancing Post-Treatment Visual Acuity Prediction with Multimodal Deep Learning on Small-scale Clinical and OCT Datasets},
  author    = {Anderson, Matthew and Corona, Veronica and Stankiewicz, Agnieszka and Habib, Maged and Steel, David H. and Obara, Boguslaw},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages     = {1--14},
  year      = {2026},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume    = {301},
  series    = {Proceedings of Machine Learning Research},
  month     = {09--11 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/anderson26a/anderson26a.pdf},
  url       = {https://proceedings.mlr.press/v301/anderson26a.html},
  abstract  = {Predicting visual acuity (VA) outcomes after treatment in diabetic macular edema (DME) is crucial for optimizing patient management but remains challenging due to the heterogeneity of patient responses and the limited availability of comprehensive datasets. While existing predictive models have shown promise, their clinical deployment is hindered by their reliance on large training datasets that are often unavailable in real-world settings. We address this challenge by developing a multimodal deep learning framework specifically designed for small-scale clinical cohorts. Our approach integrates optical coherence tomography (OCT) images with carefully selected clinical parameters through a cross-modal fusion architecture that leverages attention mechanisms to enhance feature interaction and predictive accuracy. We validate our framework across two clinically distinct real-world cohorts: treatment-naïve patients ($n=35$) receiving intensive anti-VEGF therapy and chronically treated patients ($n=20$) receiving sustained-release corticosteroid implants. This approach achieves mean absolute errors in post-treatment VA prediction of $3.07 \pm 0.82$ and $4.20 \pm 2.79$ Early Treatment Diabetic Retinopathy Study (ETDRS) letters, respectively, falling within the acceptable range of clinical measurement variability and meeting thresholds for statistically significant visual change detection with $\geq90\%$ confidence. This work demonstrates that appropriately designed multimodal architectures can achieve clinically meaningful prediction accuracy even with limited datasets, offering a practical foundation for personalized DME management in typical clinical settings where large datasets are unavailable.}
}



% PMLR v301, pp. 15--29: DualU-Net nuclei instance segmentation.
% The accented surname is written as a braced BibTeX special character.
@InProceedings{pmlr-v301-anglada-rotger26a,
  title     = {Two Heads Are Enough: DualU-Net, a Fast and Efficient Architecture for Nuclei Instance Segmentation},
  author    = {Anglada-Rotger, David and Jansat, Berta and Marques, Ferran and Pard{\`a}s, Montse},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages     = {15--29},
  year      = {2026},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume    = {301},
  series    = {Proceedings of Machine Learning Research},
  month     = {09--11 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/anglada-rotger26a/anglada-rotger26a.pdf},
  url       = {https://proceedings.mlr.press/v301/anglada-rotger26a.html},
  abstract  = {Accurate detection and classification of cell nuclei in histopathological images are critical for both clinical diagnostics and large-scale digital pathology workflows. In this work, we introduce DualU-Net, a fully convolutional, multi-task architecture designed to streamline nuclei classification and segmentation. Unlike the widely adopted three-decoder paradigm of HoVer-Net, DualU-Net employs only two output heads: a segmentation decoder that predicts pixel-wise classification maps and a detection decoder that estimates Gaussian-based centroid density maps. By leveraging these two outputs, our model effectively reconstructs instance-level segmentations. The proposed architecture results in significantly faster inference, reducing processing time by up to x5 compared to HoVer-Net, while achieving classification and detection performance comparable to State-of-the-Art models. Additionally, our approach demonstrates greater computational efficiency than CellViT and NuLite. We further show that DualU-Net is more robust to staining variations, a common challenge in digital pathology workflows. The model has been successfully deployed in clinical settings as part of the DigiPatICS initiative, operating across eight hospitals within the Institut Catal{à} de la Salut (ICS) network, highlighting the practical viability of DualU-Net as an efficient and scalable solution for nuclei segmentation and classification in real-world pathology applications. The code and pretrained model weights are publicly available on https://github.com/davidanglada/DualU-Net.}
}



% PMLR v301, pp. 30--58: virtual stain multiplexed CD68 for macrophage
% detection in NSCLC PD-L1 slides.
@InProceedings{pmlr-v301-arbel26a,
  title     = {Evaluation of Virtual Stain Multiplexed CD68 for Macrophage Detection in NSCLC PD-L1 Slides},
  author    = {Arbel, Elad and Ben-David, Oded and Remer, Itay and Ben-Dor, Amir and Rabkin, Daniela and Aviel-Ronen, Sarit and Aidt, Frederik and Hagedorn-Olsen, Tine and Jacobsen, Lars and Kersch, Kristopher and Christian, Jim and Nguyen, Quyen and Tsalenko, Anya},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages     = {30--58},
  year      = {2026},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume    = {301},
  series    = {Proceedings of Machine Learning Research},
  month     = {09--11 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/arbel26a/arbel26a.pdf},
  url       = {https://proceedings.mlr.press/v301/arbel26a.html},
  abstract  = {Manual reading of tissue slides by pathologists serves both as a foundation for clinical decision-making and as a source of ground truth for training artificial intelligence (AI) models. However, challenges such as inter-observer variability, limited tissue availability, and complex annotation tasks often compromise reliability and scalability. This study exemplifies a broader trend in pathology: leveraging virtual staining and other AI-based methodologies to address these challenges. We applied virtual stain multiplexing to a challenging annotation task - macrophage identification in non-small cell lung cancer tissue PD-L1 IHC stains, demonstrating its ability to improve pathologist performance and inter-observer agreement. In six challenging regions selected from 49 curated whole slide images, virtual staining significantly increased macrophage detection consistency, with Fleiss' kappa improving from -0.1 to 0.62, and enhanced overall accuracy, with the F1 score increasing from 0.13 to 0.65. These results highlight the potential use of AI-based virtual staining to assist pathologists reading slides, thereby improving consistency, enhancing accuracy, and alleviating the dependence on additional costly staining. Virtual stain multiplexing demonstrates a generalizable approach to improving pathologist performance through measurement-based AI tools, addressing broader needs for reproducibility and efficiency in diagnostic pathology.}
}



% PMLR v301, pp. 59--81: Chest-OMDL, weakly supervised multidisease detection
% and localization in chest CT.
@InProceedings{pmlr-v301-bai26a,
  author    = {Bai, Xuguang and Liu, Mingxuan and Chen, Yifei and Yang, Hongjia and Tian, Qiyuan},
  title     = {Chest-OMDL: Organ-specific Multidisease Detection and Localization in Chest Computed Tomography using Weakly Supervised Deep Learning from Free-text Radiology Report},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  series    = {Proceedings of Machine Learning Research},
  volume    = {301},
  publisher = {PMLR},
  year      = {2026},
  month     = {09--11 Jul},
  pages     = {59--81},
  url       = {https://proceedings.mlr.press/v301/bai26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/bai26a/bai26a.pdf},
  abstract  = {Deep learning (DL) models designed to detect abnormalities in chest computed tomography (CT) reduce radiologists’ workload. However, training multidisease diagnostic models requires large expert-annotated datasets, significantly increasing model development cost. To address this challenge, we propose a weakly supervised learning (WSL) framework entitled Chest-OMDL for Organ-specific Multidisease Detection and Localization in chest CT. Chest-OMDL trains DL models using disease labels extracted by RadBERT from free-text radiology reports and multi-organ segmentation masks generated by the Segment Anything by Text (SAT) model, therefore reducing the need for manual annotation. Specifically, Chest-OMDL employs a Y-shaped Mamba model (Y-Mamba), comprising a feature extractor, an organ segmentation decoder, and a disease anomaly map generator. By incorporating multidisease anatomical knowledge, Y-Mamba is trained with a multi-task loss for organ-level weak supervision. Chest-OMDL was trained and validated on the large-scale CT-RATE dataset (25,692 non-contrast 3D chest CT scans from 21,304 patients) and tested on the external RAD-ChestCT dataset (3,630 scans), outperforming CT-CLIP (contrastive language-image pre-training) and CT-Net (full supervision). Code: \url{https://github.com/JasonW375/Chest-OMDL}},
}



% PMLR v301, pp. 82--99: GAMBAS, super-resolution of paediatric
% ultra-low-field MRI. Accented names use braced BibTeX special characters;
% middle initials are separated and punctuated uniformly.
@InProceedings{pmlr-v301-baljer26a,
  title     = {GAMBAS: Generalised-Hilbert Mamba for Super-resolution of Paediatric Ultra-Low-Field MRI},
  author    = {Baljer, Levente and Briski, Ula and Leech, Robert and Bourke, Niall J. and Donald, Kirsten A. and Bradford, Layla E. and Williams, Simone R. and Parkar, Sadia and Kaleem, Sidra and Osmani, Salman and Deoni, Sean C. L. and Williams, Steven C. R. and Moran, Rosalyn J. and Robinson, Emma C. and V{\'a}{\v{s}}a, Franti{\v{s}}ek},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages     = {82--99},
  year      = {2026},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume    = {301},
  series    = {Proceedings of Machine Learning Research},
  month     = {09--11 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/baljer26a/baljer26a.pdf},
  url       = {https://proceedings.mlr.press/v301/baljer26a.html},
  abstract  = {Magnetic resonance imaging (MRI) is critical for neurodevelopmental research, however access to high-field (HF) systems in low- and middle-income countries is severely hindered by their cost. Ultra-low-field (ULF) systems mitigate such issues of access inequality, however their diminished signal-to-noise ratio limits their applicability for research and clinical use. Deep-learning approaches can enhance the quality of scans acquired at lower field strengths at no additional cost. For example, Convolutional neural networks (CNNs) fused with transformer modules have demonstrated a remarkable ability to capture both local information and long-range context. Unfortunately, the quadratic complexity of transformers leads to an undesirable trade-off between long-range sensitivity and local precision. We propose a hybrid CNN and state-space model (SSM) architecture featuring a novel 3D to 1D serialisation (GAMBAS), which learns long-range context without sacrificing spatial precision. We exhibit improved performance compared to other state-of-the-art medical image-to-image translation models. Our code is made publicly available at https://github.com/levente-1/GAMBAS.}
}



% PMLR v301, pp. 100--126: Transformer autoencoders producing vector
% representations of vessel trees.
@InProceedings{pmlr-v301-batten26a,
  author    = {Batten, James and Schaap, Michiel and Sinclair, Matthew and Bai, Ying and Glocker, Ben},
  title     = {Vector Representations of Vessel Trees},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  series    = {Proceedings of Machine Learning Research},
  volume    = {301},
  publisher = {PMLR},
  year      = {2026},
  month     = {09--11 Jul},
  pages     = {100--126},
  url       = {https://proceedings.mlr.press/v301/batten26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/batten26a/batten26a.pdf},
  abstract  = {We introduce a novel framework for learning vector representations of tree-structured geometric data focusing on 3D vascular networks. Our approach employs two sequentially trained Transformer-based autoencoders. In the first stage, the Vessel Autoencoder captures continuous geometric details of individual vessel segments by learning embeddings for sampled points along each curve. In the second stage, the Vessel Tree Autoencoder encodes the topology of the vascular network as a single vector representation, leveraging the segment-level embeddings from the first model. A recursive decoding process ensures that the reconstructed topology is a valid tree structure. Compared to 3D convolutional models, this proposed approach substantially lowers GPU memory requirements, facilitating large-scale training. Experimental results on a 2D synthetic tree dataset and a 3D coronary artery dataset demonstrate superior reconstruction fidelity, accurate topology preservation, and realistic interpolations in latent space. Our scalable framework, named VeTTA, offers precise, flexible, and topologically consistent modeling of anatomical tree structures in medical imaging.},
}



% PMLR v301, pp. 127--152: SACP, spatially-adaptive conformal prediction for
% medical image segmentation.
@InProceedings{pmlr-v301-bereska26a,
  title     = {SACP: Spatially-Adaptive Conformal Prediction in Uncertainty Quantification of Medical Image Segmentation},
  author    = {Bereska, Jacqueline Isabel and Karimi, Hamed and Samavi, Reza},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages     = {127--152},
  year      = {2026},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume    = {301},
  series    = {Proceedings of Machine Learning Research},
  month     = {09--11 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/bereska26a/bereska26a.pdf},
  url       = {https://proceedings.mlr.press/v301/bereska26a.html},
  abstract  = {While Conformal Prediction provides statistical coverage guarantees, existing non-conformity measures fail to account for spatially varying importance of predictive uncertainty in medical image segmentation. In this paper, we incorporate spatial context near critical interfaces such as a vessel or critical organ in medical image segmentation. Our framework consists of three key components: (1) a base non-conformity score derived from segmentation model probabilities, (2) employing class-conditional calibration followed by a validation mechanism equipped with a distance-weighted scoring function that exponentially decays with distance from key interfaces, and (3) a prediction set construction method that preserves coverage guarantees while providing targeted uncertainty quantification in critical regions. While our approach is generalizable to different scenarios, for validation purposes, we employ tumor segmentation in pancreatic adenocarcinoma imaging from multiple medical centers. Results demonstrate that our method achieves the desired coverage levels while generating prediction sets that adaptively expand near critical interfaces.}
}



% PMLR v301, pp. 153--166: DiffRGenNet, difference-aware medical report
% generation. The abstract is written as running text, without the hard
% line-break hyphens and fused words of a PDF text extraction.
@InProceedings{pmlr-v301-bian26a,
  title     = {DiffRGenNet: Difference-aware Medical Report Generation},
  author    = {Bian, Minghao and Zhang, Kun and Zhao, Dexin and Zhou, S. Kevin},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages     = {153--166},
  year      = {2026},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume    = {301},
  series    = {Proceedings of Machine Learning Research},
  month     = {09--11 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/bian26a/bian26a.pdf},
  url       = {https://proceedings.mlr.press/v301/bian26a.html},
  abstract  = {Medical report generation is a critical task in healthcare, aiming to automatically produce accurate diagnostic reports from medical images, thereby alleviating the burden on radiologists. However, due to the high similarity among medical images of the same anatomical region and the substantial variations captured from the same region across different timepoints for individual patients, capturing these differences poses a significant challenge. We propose a Difference-aware Report Generation Network (DiffRGenNet), which retrieves similar reports through image search, identifies differences using the Feature Diff module, and dynamically orchestrates global and local dependencies via the FlexiRoute Aggregation Module to determine the optimal routing path for each sample, selecting the most suitable report to describe the variations and connections. Finally, by leveraging the consistency of classification information and the discrepancy information from the diff module, DiffRGenNet enhances the ability to learn differences in rare diseases, generating more precise reports. Experiments demonstrate that DiffRGenNet outperforms existing methods on the MIMIC-CXR and IU X-Ray datasets, confirming its effectiveness and potential.}
}



% PMLR v301, pp. 167--182: RCSegNeXt, multi-scale ConvNeXt for rectal cancer
% segmentation from sagittal MRI. The abstract is written as running text,
% without the hard line-break hyphens and fused words of a PDF text extraction.
@InProceedings{pmlr-v301-bo26a,
  title     = {RCSegNeXt: Efficient multi-scale ConvNeXt for rectal cancer segmentation from sagittal MRI scans},
  author    = {Bo, Wang and Xue, Ting and Pan, Leyang and Huang, Dingfu and Xiao, Yi and Fan, Li and Liu, Zaiyi and Liu, Shiyuan and Zhou, S. Kevin},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages     = {167--182},
  year      = {2026},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume    = {301},
  series    = {Proceedings of Machine Learning Research},
  month     = {09--11 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/bo26a/bo26a.pdf},
  url       = {https://proceedings.mlr.press/v301/bo26a.html},
  abstract  = {Rectal cancer remains a critical global health challenge, significantly contributing to morbidity and mortality worldwide. Magnetic resonance imaging (MRI) in a sagittal plane offers distinct advantages for rectal cancer diagnosis by providing detailed visualization of the rectum and its surrounding anatomy. However, automated segmentation of the rectum and associated tumors remains difficult due to tumor heterogeneity and complex anatomical structure, which necessitate multi-scale feature extraction. This study proposes RCSegNeXt, a novel non-uniform pure-convolutional rectal cancer segmentation architecture that combines shallow anisotropic stages with deep isotropic stages. The anisotropic stages leverage AniNeXt blocks, designed with customized convolutional kernels and pooling operations to address the uneven spatial resolution inherent in MRI data. In the isotropic stages, an IsoNeXt block with a Scale-Aware Integration Module (SAIM) enables efficient multi-scale feature fusion by directing information flow through constrained pathways. This design enhances computational efficiency while achieving superior segmentation accuracy. Experiments on two in-house datasets demonstrate the proposed method’s state-of-the-art performances. Code will be open upon acceptance.}
}



% PMLR v301, pp. 183--194: hepatic perivascular adipose tissue as a CT
% biomarker for staging liver fibrosis. Percent signs in the abstract are
% escaped for LaTeX; author initials are separated and punctuated.
@InProceedings{pmlr-v301-chan26a,
  title     = {Staging Liver Fibrosis with Hepatic Perivascular Adipose Tissue as a CT Biomarker},
  author    = {Chan, Skylar and Mathai, Tejas Sudharshan and Balamuralikrishna, Praveen T. S. and Batheja, Vivek and Liu, Jianfei and Lubner, Meghan G. and Pickhardt, Perry J. and Summers, Ronald},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages     = {183--194},
  year      = {2026},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume    = {301},
  series    = {Proceedings of Machine Learning Research},
  month     = {09--11 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/chan26a/chan26a.pdf},
  url       = {https://proceedings.mlr.press/v301/chan26a.html},
  abstract  = {Cirrhosis is the 12th leading cause of death in the US. There are several CT imaging signs of late fibrosis, such as redistribution of liver segment volume, increased liver nodularity, and periportal space widening. Timely intervention can reverse the progression of early hepatic fibrosis, but later stages are irreversible. We hypothesize that the perivascular adipose tissue (PVAT) around the portal vein arising from periportal space widening may also be predictive of liver fibrosis. In this work, a fully automated pipeline was developed to segment the liver, spleen, portal vein and its branches. The PVAT in the vicinity of the portal vein was identified. From these structures, CT imaging biomarkers (volume, attenuation, fat fraction) were computed. They were used to build uni- and multivariate logistic regression models for diagnosing advanced fibrosis and cirrhosis. The best multivariate model for cirrhosis achieved 93.3\% AUC, 78.9\% sensitivity, and 93.4\% specificity. For advanced fibrosis, the multivariate model obtained 88.7\% AUC, 84.2\% sensitivity, and 73.7\% specificity. The automated approach may be useful for population-based studies of metabolic disease and opportunistic screening.}
}



% PMLR v301, pp. 195--213: equivariant imaging biomarkers for unsupervised
% histopathology segmentation. Middle initials carry periods so that styles
% printing full given names render them correctly.
@InProceedings{pmlr-v301-chen26a,
  title     = {Equivariant Imaging Biomarkers for Robust Unsupervised Segmentation of Histopathology},
  author    = {Chen, Fuyao and Du, Yuexi and Zeevi, Tal and Dvornek, Nicha C. and Onofrey, John A.},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages     = {195--213},
  year      = {2026},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume    = {301},
  series    = {Proceedings of Machine Learning Research},
  month     = {09--11 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/chen26a/chen26a.pdf},
  url       = {https://proceedings.mlr.press/v301/chen26a.html},
  abstract  = {Histopathology evaluation of tissue specimens through microscopic examination is essential for accurate disease diagnosis and prognosis. However, traditional manual analysis by specially trained pathologists is time-consuming, labor-intensive, cost-inefficient, and prone to inter-rater variability, potentially affecting diagnostic consistency and accuracy. As digital pathology images continue to proliferate, there is a pressing need for automated analysis to address these challenges. Recent advancements in artificial intelligence-based tools such as machine learning (ML) models, have significantly enhanced the precision and efficiency of analyzing histopathological slides. However, despite their impressive performance, ML models are invariant only to translation, lacking invariance to rotation and reflection. This limitation restricts their ability to generalize effectively, particularly in histopathology, where images intrinsically lack meaningful orientation. In this study, we develop robust, equivariant histopathological biomarkers through a novel symmetric convolutional kernel via unsupervised segmentation. The approach is validated using prostate tissue micro-array (TMA) images from 50 patients in the Gleason 2019 Challenge public dataset. The biomarkers extracted through this approach demonstrate enhanced robustness and generalizability against rotation compared to models using standard convolution kernels, holding promise for enhancing the accuracy, consistency, and robustness of ML models in digital pathology. Ultimately, this work aims to improve diagnostic and prognostic capabilities of histopathology beyond prostate cancer through equivariant imaging.}
}



% PMLR v301, pp. 214--225: contrastive learning for retinal imaging with
% adjusted augmentation scales. Percent signs in the abstract are escaped
% for LaTeX and each P-value comparison is a single math expression.
@InProceedings{pmlr-v301-cheng26a,
  title     = {Enhancing Contrastive Learning for Retinal Imaging via Adjusted Augmentation Scales},
  author    = {Cheng, Zijie and Li, Boxuan and Altmann, Andre and Keane, Pearse and Zhou, Yukun},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages     = {214--225},
  year      = {2026},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume    = {301},
  series    = {Proceedings of Machine Learning Research},
  month     = {09--11 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/cheng26a/cheng26a.pdf},
  url       = {https://proceedings.mlr.press/v301/cheng26a.html},
  abstract  = {Contrastive learning, a typical self-supervised learning strategy, operates on bringing similar data together while pushing dissimilar data apart in latent space. This approach extracts robust and discriminative representations, thus being widely used in natural computer vision tasks, such as object classification. However, unlike natural images, medical images (e.g., retinal images) tend to share substantial similarities in imaging area and anatomical tissues, leading to a denser distribution in latent space. As a result, the default use of strong augmentations in contrastive learning potentially exacerbates this intensive distribution in retinal images, making it difficult to distinguish between genuinely similar and dissimilar data, and therefore hindering model pre-training convergence. In this paper, we hypothesise that weaker augmentations are better suited to contrastive learning for medical image applications, and we investigate model performance under various augmentation strategies. Our study includes six publicly available retinal datasets covering multiple clinically relevant tasks. We assess the models' performance and generalizability via extensive experiments. The model pre-trained with weak augmentation outperforms the one pre-trained with strong augmentation, achieving approximately a 6\% increase in AUPR ($P<0.001$) and a 12.5\% increase in sensitivity ($P<0.001$) on MESSIDOR-2. Similar improvements are observed across other datasets. Our findings suggest that optimizing the scale of augmentation is critical for enhancing the efficacy of contrastive learning in medical imaging. The model weights and relevant code are available at: https://github.com/ziijiecheng/Enhance-contrastive-SSL-for-Retinal-Imaging.}
}



% PMLR v301, pp. 226--238: foundation model ensembles for predicting lymph
% node metastasis in early gastric cancer from whole-slide imaging.
@InProceedings{pmlr-v301-chung26a,
  author    = {Chung, Woojin and Park, Yujun and Nam, Yoonho},
  title     = {Foundation Model Ensemble for Out-of-Distribution Generalization: Predicting Lymph Node Metastasis in Early Gastric Cancer Using Whole-Slide Imaging},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  series    = {Proceedings of Machine Learning Research},
  volume    = {301},
  publisher = {PMLR},
  year      = {2026},
  month     = {09--11 Jul},
  pages     = {226--238},
  url       = {https://proceedings.mlr.press/v301/chung26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/chung26a/chung26a.pdf},
  abstract  = {Recent advances in deep learning have improved the practicality of automated analysis for whole-slide imaging. However, challenges remain in image analysis due to variations in imaging equipment, tissue preparation, staining protocols, and other variables. These variations hinder the generalizability of trained models to external datasets. Recently, foundation models trained on large-scale pathology datasets have been introduced by various research groups, demonstrating the potential to address this issue. Since each foundation model was trained on datasets collected from different sources under varying settings, the learned representations reflect different characteristics to some extent. These differences suggest that leveraging the information of multiple models could improve generalization and robustness compared to using a single model. In this study, we investigate foundation model ensembles for predicting lymph node metastasis in early gastric cancer across three different datasets. By comparing ensemble models with individual ones, we demonstrate that ensembling multiple foundation models improves performance in whole-slide imaging for both in-distribution and out-of-distribution data.},
}



% PMLR v301, pp. 239--252: transformer-based multimodal prediction of the year
% of total knee replacement. The abstract is written as running text, without
% the hard line-break hyphens and fused words of a PDF text extraction; the
% repository link is wrapped in \url{} because it contains underscores.
@InProceedings{pmlr-v301-cigdem26a,
  title     = {Predicting the Year of Total Knee Replacement: A Transformer-Based Multimodal Approach},
  author    = {Cigdem, Ozkan and Soyak, Refik and Cho, Kyunghyun and Deniz, Cem M.},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages     = {239--252},
  year      = {2026},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume    = {301},
  series    = {Proceedings of Machine Learning Research},
  month     = {09--11 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/cigdem26a/cigdem26a.pdf},
  url       = {https://proceedings.mlr.press/v301/cigdem26a.html},
  abstract  = {Accurate prediction of the year of total knee replacement (TKR) is challenging due to the complex interplay of factors influencing the surgical decision. Current deep learning models often rely on single-modality data, limiting their predictive power. Multimodal approaches integrating imaging and patient data offer the potential to improve predictions and support clinical decisions. This study presents an end-to-end trained, transformer-based multimodal model that integrates MR imaging with tabular data, including clinical variables and image readings, to predict the year of TKR for each subject. Our model leverages cross-modal attention to fuse features from an image encoder with a self-supervised pretrained tabular encoder, achieving the highest accuracy of 63.4\% among tested models. We evaluated its performance against three unimodal models and four multimodal fusion strategies, including simple concatenation, DAFT, and multimodal interaction. The results demonstrate that our model’s cross-modal interaction approach with pretrained TabNet not only outperformed all unimodal models but also showed improvements over other multimodal fusion techniques, highlighting the effectiveness of cross-modal attention fusion for integrating complex data modalities in TKR year prediction tasks. Source code is available at \url{https://github.com/denizlab/2025_MIDL_time2TKR}.}
}



% MIDL 2025 paper in PMLR volume 301, pages 253--279.
@inproceedings{pmlr-v301-corbetta26a,
  author    = {Corbetta, Valentina and Beets-Tan, Regina and Cardoso, Jaime S and Silva, Wilson},
  title     = {Understanding the Impact of Client Heterogeneity on Ordinal Classification in Federated Medical Image Analysis},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  series    = {Proceedings of Machine Learning Research},
  volume    = {301},
  pages     = {253--279},
  year      = {2026},
  month     = {09--11 Jul},
  publisher = {PMLR},
  url       = {https://proceedings.mlr.press/v301/corbetta26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/corbetta26a/corbetta26a.pdf},
  abstract  = {Deep learning methods have shown remarkable success in medical image classification, aiding in early disease detection and treatment. Many of these tasks, such as cancer staging or risk stratification, exhibit an inherent ordinal structure; however, existing solutions often reduce them to binary or purely nominal classifications, ignoring the valuable ordering information. Simultaneously, privacy and regulatory concerns have spurred the adoption of Federated Learning (FL), enabling collaborative model training without centralising sensitive patient data. Yet, FL in real-world medical scenarios faces significant challenges arising from heterogeneous client data, particularly when institutions differ widely in case severity or label distribution. In this work, we conduct the first in-depth study of Federated Ordinal Learning (FOL), introducing ordinal classification paradigms into FL pipelines and systematically evaluating their performance under increasing levels of data heterogeneity. We assess the benefits of ordinal classification within four FL frameworks: standard Federated Averaging (FedAvg) and three heterogeneity-focused approaches (FedProx, MOON, and FedALA). Our experiments reveal that ordinal methods can effectively maintain class ordering information even when institutional data exhibit severe imbalance or missing classes, offering valuable insights for developing robust, privacy-preserving AI systems in medical imaging. However, ordinal approaches still suffer from performance degradation in highly heterogeneous FL settings, underscoring the need for dedicated research on FL methods that explicitly account for ordinality.}
}



% Title braces keep the model name and "3D" from being lowercased by sentence-casing styles.
@InProceedings{pmlr-v301-das26a,
  title = 	 {{SegResMamba}: An Efficient Architecture for {3D} Medical Image Segmentation},
  author =       {Das, Badhan Kumar and Singh, Ajay and Islam, Saahil and Zhao, Gengyan and Maier, Andreas},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {280--292},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/das26a/das26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/das26a.html},
  abstract = 	 {The Transformer architecture has opened a new paradigm in the domain of deep learning with its ability to model long-range dependencies and capture global context and has outpaced the traditional Convolution Neural Networks (CNNs) in many aspects. However, applying Transformer models to 3D medical image datasets presents significant challenges due to their high training time, and memory requirements, which not only hinder scalability but also contribute to elevated CO$_2$ footprint. This has led to an exploration of alternative models that can maintain or even improve performance while being more efficient and environmentally sustainable. Recent advancements in Structured State Space Models (SSMs) effectively address some of the inherent limitations of Transformers, particularly their high memory and computational demands. Inspired by these advancements, we propose an efficient 3D segmentation model for medical imaging called SegResMamba, designed to reduce computation complexity, memory usage, training time, and environmental impact while maintaining high performance. Our model uses less than half the memory during training compared to other state-of-the-art (SOTA) architectures, achieving comparable performance with significantly reduced resource demands.}
}



% Title braces keep the method acronym from being lowercased by sentence-casing styles.
@InProceedings{pmlr-v301-deng26a,
  title = 	 {{CASC-AI}: Consensus-aware Self-corrective Learning for Cell Segmentation with Noisy Labels},
  author =       {Deng, Ruining and Yang, Yihe and Pisapia, David J and Liechty, Benjamin L and Zhu, Junchao and Xiong, Juming and Guo, Junlin and Lu, Zhengyi and Wang, Jiacheng and Yao, Xing and Yu, Runxuan and Zhang, Rendong and Rudravaram, Gaurav and Yin, Mengmeng and Sarder, Pinaki and Yang, Haichun and Huo, Yuankai and Sabuncu, Mert R.},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {293--309},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/deng26a/deng26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/deng26a.html},
  abstract = 	 {Multi-class cell segmentation in high-resolution gigapixel whole slide images (WSIs) is crucial for various clinical applications. However, training such models typically requires labor-intensive, pixel-wise annotations by domain experts. Recent efforts have democratized this process by involving lay annotators without medical expertise. However, conventional non-corrective approaches struggle to handle annotation noise adaptively because they lack mechanisms to mitigate false positives (FP) and false negatives (FN) at both the image-feature and pixel levels. In this paper, we propose a consensus-aware self-corrective learning that leverages the Consensus Matrix to guide its learning process. The Consensus Matrix defines regions where both the AI and annotators agree on cell and non-cell annotations, which are prioritized with stronger supervision. Conversely, areas of disagreement are adaptively weighted based on their feature similarity to high-confidence consensus regions, with more similar regions receiving greater attention. Additionally, contrastive learning is employed to separate features of noisy regions from those of reliable consensus regions by maximizing their dissimilarity. This paradigm enables the model to iteratively refine noisy labels, enhancing its robustness. Validated on one real-world lay-annotated cell dataset and two reasoning-guided simulated noisy datasets, our method demonstrates improved segmentation performance, effectively correcting FP and FN errors and showcasing its potential for training robust models on noisy datasets. The official implementation and cell annotations are publicly available at https://github.com/ddrrnn123/CASC-AI.}
}



% MIDL 2025 paper in PMLR volume 301, pages 310--326.
@inproceedings{pmlr-v301-denner26a,
  author    = {Denner, Stefan and Bujotzek, Markus Ralf and Bounias, Dimitrios and Zimmerer, David and Stock, Raphael and Maier-Hein, Klaus},
  title     = {Visual Prompt Engineering for Vision Language Models in Radiology},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  series    = {Proceedings of Machine Learning Research},
  volume    = {301},
  pages     = {310--326},
  year      = {2026},
  month     = {09--11 Jul},
  publisher = {PMLR},
  url       = {https://proceedings.mlr.press/v301/denner26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/denner26a/denner26a.pdf},
  abstract  = {Medical image classification plays a crucial role in clinical decision-making, yet most models are constrained to a fixed set of predefined classes, limiting their adaptability to new conditions. Contrastive Language-Image Pretraining (CLIP) offers a promising solution by enabling zero-shot classification through multimodal large-scale pretraining. However, while CLIP effectively captures global image content, radiology requires a more localized focus on specific pathology regions to enhance both interpretability and diagnostic accuracy. To address this, we explore the potential of incorporating visual cues into zero-shot classification, embedding visual markers, such as arrows, bounding boxes, and circles, directly into radiological images to guide model attention. Evaluating across four public chest X-ray datasets, we demonstrate that visual markers improve AUROC by up to 0.185, highlighting their effectiveness in enhancing classification performance. Furthermore, attention map analysis confirms that visual cues help models focus on clinically relevant areas, leading to more interpretable predictions. To support further research, we use public datasets and provide our codebase and preprocessing pipeline, serving as a reference point for future work on localized classification in medical imaging.}
}



% Title braces keep the model name from being lowercased by sentence-casing styles.
@InProceedings{pmlr-v301-deshmukh26a,
  title = 	 {{MedDelinea}: Scalable and Efficient Medical Image Segmentation via Controllable Diffusion Transformers},
  author =       {Deshmukh, Gayatri and Susladkar, Onkar Kishor and Jha, Debesh and Keles, Elif and Aktas, Halil Ertugrul and Medetalibeyoglu, Alpay and Ladner, Daniela P. and Borhani, Amir A. and Durak, Gorkem and Bagci, Ulas},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {327--345},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/deshmukh26a/deshmukh26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/deshmukh26a.html},
  abstract = 	 {We introduce MedDelinea, a novel medical image segmentation architecture that leverages a controllable module, drawing inspiration from ControlNet, within the Diffusion Transformers (DiT) framework. By doing so, we effectively address three key challenges inherent to segmentation tasks: (1) limited availability of labeled data, (2) variability in image modalities, and (3) the need for precise boundary delineation. MedDelinea is pre-trained on a large-scale medical dataset, thereby mitigating overfitting risks and enabling efficient transfer across diverse imaging scenarios with minimal fine-tuning requirements. The modular design of MedDelinea facilitates scalable and efficient computation, while maintaining high-quality segmentation performance in both supervised and zero-shot settings. Through extensive empirical evaluations on multiple datasets, we demonstrate that MedDelinea outperforms existing state-of-the-art segmentation approaches, showcasing its potential for robust and accurate medical image analysis.}
}



% MIDL 2025 paper in PMLR volume 301, pages 346--365.
@inproceedings{pmlr-v301-deshpande26a,
  author    = {Deshpande, Rucha and Lago, Miguel and Subbaswamy, Adarsh and Kahaki, Seyed and Delfino, Jana G and Badano, Aldo and Zamzmi, Ghada},
  title     = {A knowledge-based method for detecting network-induced shape artifacts in synthetic images},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  series    = {Proceedings of Machine Learning Research},
  volume    = {301},
  pages     = {346--365},
  year      = {2026},
  month     = {09--11 Jul},
  publisher = {PMLR},
  url       = {https://proceedings.mlr.press/v301/deshpande26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/deshpande26a/deshpande26a.pdf},
  abstract  = {The adoption of synthetic medical images for training or testing without thorough quality assessment risks introducing artifacts and unrealistic features that can mislead machine learning models and compromise clinical utility. This work introduces a novel knowledge-based method for detecting network-induced shape artifacts in synthetic images. The method can identify anatomically unrealistic images, detect shape artifacts irrespective of the generative model, and offer interpretability through its knowledge-driven design. We validated the method using two synthetic mammography datasets and demonstrated its effectiveness in flagging images with network-induced artifacts. A reader study further confirmed these findings and showed that the most anomalous images identified by the method were also flagged by human readers. This method provides a step toward the responsible use of synthetic data by ensuring synthetic images align with realistic morphological and anatomical constraints.}
}



% The title carries no trailing period: bibliography styles add the terminating punctuation themselves.
% NOTE(review): the indicator symbol in the abstract is written as $t'$, reconstructed from a garbled export -- confirm against the paper.
@InProceedings{pmlr-v301-dombrowski26a,
  title = 	 {Can Diffusion Models Generalize? Privacy and Fairness Trade-offs for Medical Data Sharing},
  author =       {Dombrowski, Mischa and Kainz, Bernhard},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {366--392},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/dombrowski26a/dombrowski26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/dombrowski26a.html},
  abstract = 	 {The recent surge in options for diffusion model-based synthetic data sharing offers significant benefits for medical research, provided privacy and fairness concerns are addressed. Generative models risk memorizing sensitive training samples, potentially exposing identifiable information. Simultaneously, underrepresented features -- such as rare diseases, uncommon medical devices, or infrequent patient ethnicities -- are often not learned well, creating unfair biases in downstream applications. Our work unifies these challenges by leveraging artificially generated fingerprints (SAFs) in the training data as a controllable test for memorization and fairness. Specifically, we measure whether a diffusion model reproduces these fingerprints verbatim (a privacy breach) or ignores them entirely (a fairness violation) and introduce an indicator $t'$ to quantify finished models for the likelihood of reproducing training samples. Extensive experiments on real and synthetic medical imaging datasets reveal that na{\"i}ve diffusion model training can lead to privacy leaks or unfair coverage. By systematically incorporating SAFs and monitoring $t'$, we demonstrate how to balance privacy and fairness objectives. Our evaluation framework provides actionable guidance for designing generative models that preserve patient anonymity without excluding underrepresented patient subgroups. Code is available at https://github.com/MischaD/Privacy.}
}



% First author is entered surname-first as "Le Bot, Edern"; the citation key and URLs
% still use the edern26a identifier, so they are left unchanged.
% NOTE(review): surname/given-name order for the first author should be confirmed against the paper.
% Title braces keep the method name and the FLAIR/MRI acronyms from being lowercased by sentence-casing styles.
@InProceedings{pmlr-v301-edern26a,
  title = 	 {{FLAIRBrainSeg}: Fine-Grained Brain Segmentation Using {FLAIR} {MRI} Only},
  author =       {Le Bot, Edern and Giraud, R{\'e}mi and Mansencal, Boris and Tourdias, Thomas and Manjon, Jose V and Coupe, Pierrick},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {393--406},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/edern26a/edern26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/edern26a.html},
  abstract = 	 {This paper introduces a novel method for brain segmentation using only FLAIR MRIs, specifically targeting cases where access to other imaging modalities is limited. By leveraging existing automatic segmentation methods, we train a network to approximate segmentations, typically obtained from T1-weighted MRIs. Our method, called FLAIRBrainSeg, produces segmentations of 132 structures and is robust to multiple sclerosis lesions. Experiments on both in-domain and out-of-domain datasets demonstrate that our method outperforms modality-agnostic approaches based on image synthesis, the only currently available alternative for performing brain parcellation using FLAIR MRI alone. This technique holds promise for scenarios where T1-weighted MRIs are unavailable and offers a valuable alternative for clinicians and researchers in need of reliable anatomical segmentation.}
}



% MIDL 2025 paper in PMLR volume 301, pages 407--427.
@inproceedings{pmlr-v301-favero26a,
  author    = {Favero, Gian Mario and Saremi, Parham and Kaczmarek, Emily and Nichyporuk, Brennan and Arbel, Tal},
  title     = {Conditional Diffusion Models are Medical Image Classifiers that Provide Explainability and Uncertainty for Free},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  series    = {Proceedings of Machine Learning Research},
  volume    = {301},
  pages     = {407--427},
  year      = {2026},
  month     = {09--11 Jul},
  publisher = {PMLR},
  url       = {https://proceedings.mlr.press/v301/favero26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/favero26a/favero26a.pdf},
  abstract  = {Discriminative classifiers have become a foundational tool in deep learning for medical imaging, excelling at learning separable features of complex data distributions. However, these models often need careful design, augmentation, and training techniques to ensure safe and reliable deployment. Recently, diffusion models have become synonymous with generative modeling in 2D. These models showcase robustness across a range of tasks including natural image classification, where classification is performed by comparing reconstruction errors across images generated for each possible conditioning input. This work presents the first exploration of the potential of class conditional diffusion models for 2D medical image classification. First, we develop a novel majority voting scheme shown to improve the performance of medical diffusion classifiers. Next, extensive experiments on the CheXpert and ISIC Melanoma skin cancer datasets demonstrate that foundation and trained-from-scratch diffusion models achieve competitive performance against SOTA discriminative classifiers without the need for explicit supervision. In addition, we show that diffusion classifiers are intrinsically explainable, and can be used to quantify the uncertainty of their predictions, increasing their trustworthiness and reliability in safety-critical, clinical contexts. Further information is available on our project page: https://faverogian.github.io/med-diffusion-classifier.github.io/.}
}



% Title braces keep "X-ray" capitalised under sentence-casing styles; the umlaut is written
% as a brace-delimited special character so BibTeX treats it as a single letter.
@InProceedings{pmlr-v301-fay26a,
  title = 	 {Beyond the Prompt: Deploying Medical Foundation Models on Diverse Chest {X-ray} Populations},
  author =       {Fay, Louisa and Delbrouck, Jean-Benoit and K{\"u}stner, Thomas and Yang, Bin and Codella, Noel C and Lungren, Matthew P. and Langlotz, Curtis and Gatidis, Sergios},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {428--446},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/fay26a/fay26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/fay26a.html},
  abstract = 	 {Foundation models (FMs) have shown impressive performance in medical image analysis tasks, but their deployment in real-world clinical settings, especially across diverse patient populations such as adult and pediatric cases, remains challenging. Key open questions include optimal prompting techniques and strategies for model adaptation or fine-tuning for clinical use. In this study, we evaluated different approaches for deploying FMs in clinical scenarios for diverse patient populations. We use the lightweight, embedding-based vision-language FM $\textit{MedImageInsight}$ to predict pneumonia from chest X-rays, a condition common in both adult and pediatric patients. We observed a large variation in model predictive performance depending on the chosen prompt design, highlighting the importance of text prompt design for successful zero-shot (ZS) application. On in-domain datasets, we found performance differences of up to 46% in Matthews correlation coefficient (MCC) and 56% in true positive rates across different text prompts. By introducing text and vision embedding ensembles, we achieved substantial ZS improvements, outperforming training-based methods (fine-tuning, Linear Probe) in low-data scenarios by up to 43% for adults and 35% for pediatric populations (MCC). This ensembling strategy also promotes resource-efficient, equitable clinical use by supporting diverse demographic subgroups, achieving MCC improvements of 6% by sex, 17% by age, and 10% by race compared to linear probe.}
}



% Title braces keep the PLAX acronym from being lowercased by sentence-casing styles.
@InProceedings{pmlr-v301-gao26a,
  title = 	 {Machine Learning with Scarce Data: Ejection Fraction Prediction Using {PLAX} View},
  author =       {Gao, Zhiyuan and Yurk, Dominic and Abu-Mostafa, Yaser S.},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {447--457},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/gao26a/gao26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/gao26a.html},
  abstract = 	 {We developed a machine learning model to predict left ventricular ejection fraction (LVEF/EF) from parasternal long-axis (PLAX) echocardiographic videos. Because public datasets with labeled PLAX videos are virtually non-existent, our work focuses on an innovative data generation strategy to overcome this scarcity. By leveraging a time-based correlation between clinical notes and echocardiographic videos, combined with fine-tuning view classifiers and proxy labeling, we effectively created a large labeled PLAX dataset and achieved a mean absolute error (MAE) of 6.86%. Given that Apical four-chamber methods, the clinical standard, report MAE values of 6%-7%, our results demonstrate that EF estimation from PLAX views is both feasible and clinically relevant. This surpasses the performance of existing methods and provides a clinically useful solution for situations where apical views may not be feasible. The EF labels for PLAX videos, derived from publicly available datasets, are accessible at https://github.com/Jeffrey4899/PLAX_EF_Labels_202501.}
}



% Title braces keep the MRI acronym from being lowercased by sentence-casing styles.
@InProceedings{pmlr-v301-geissler26a,
  title = 	 {Multi-centric Comparison of Deep Learning Models for Lesion Detection in Breast {MRI}},
  author =       {Gei{\ss}ler, Kai and Wenzel, Markus and Diekmann, Susanne and von Busch, Heinrich and Grimm, Robert and Meine, Hans},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {458--474},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/geissler26a/geissler26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/geissler26a.html},
  abstract = 	 {Breast magnetic resonance imaging (MRI) is a common modality for diagnostic imaging in breast cancer, creating a need for automated image analysis to assist in early detection and diagnosis. In this study, we compare multiple deep learning-based segmentation and detection algorithms for lesion detection in dynamic contrast-enhanced (DCE) breast MRI. We utilized a large multi-centric dataset comprising T1-weighted DCE MR images from nine clinical sites across seven countries, encompassing diverse imaging characteristics and scanner types. We evaluated several models, including the standard nnU-Net, an adapted nnU-Net with modifications to reduce false positives, a coarse-resolution version thereof, the transformer-based SwinUNETR-V2, and nnDetection. The standard nnU-Net achieved a high lesion-level sensitivity of 83.8% but produced an average of 3.334 false positives per case, which is impractical for clinical use. The adapted (coarse) nnU-Net significantly reduced false positives to 0.666 (0.397) per case with a slight decrease in sensitivity to 79.9% (75.8%). SwinUNETR-V2 achieved comparable performance to the adapted nnU-Net. nnDetection outperformed nnU-Net in the high-sensitivity region, but performed worse than the adapted models in the lower-sensitivity region, with respect to false positives. To conclude, the nnU-Net again provides a good baseline, but our lesion detection task motivates adaptations to reduce the number of false positives.}
}



% Title braces keep the mixed-case acronym fMRI intact under sentence-casing styles.
@InProceedings{pmlr-v301-germani26a,
  title = 	 {Mitigating analytical variability in {fMRI} with style transfer},
  author =       {Germani, Elodie and Maumet, Camille and Fromont, Elisa},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {475--493},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/germani26a/germani26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/germani26a.html},
  abstract = 	 {We propose a novel approach to facilitate the re-use of neuroimaging results by converting statistic maps across different functional MRI pipelines. We make the assumption that pipelines used to compute fMRI statistic maps can be considered as a style component and we propose to use different generative models, among which, Generative Adversarial Networks (GAN) and Diffusion Models (DM) to harmonize statistic maps across different pipelines. We explore the performance of multiple GAN and DM frameworks for unsupervised multi-domain style transfer. We developed an auxiliary classifier that distinguishes statistic maps from different pipelines, allowing us to validate pipeline transfer, but also to extend traditional sampling techniques used in DM to improve the transition performance. Our experiments demonstrate that our proposed methods are successful: pipelines can indeed be transferred as a style component, providing an important source of data augmentation for future studies.}
}



% Title braces keep the FERN acronym and "2D-to-3D" from being lowercased by sentence-casing styles.
% Initials are written as separate tokens ("P. M.") so name-abbreviating styles keep both.
@InProceedings{pmlr-v301-gilliland26a,
  title = 	 {{FERN}: A Fetal Echocardiography Registration Network for {2D-to-3D} Alignment},
  author =       {Gilliland, Paula Ramirez and Lloyd, David F A and Matthew, Jacqueline and Razavi, Reza and van Poppel, Milou P. M. and King, Andrew P. and Deprez, Maria},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {494--514},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/gilliland26a/gilliland26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/gilliland26a.html},
  abstract = 	 {2D Freehand echocardiography remains the primary imaging modality for routine fetal cardiac care, essential in the antenatal detection of Congenital Heart Disease (CHD). However, there is a lack of spatial context which requires 3D imaging. Current 3D methods, such as Spatio-Temporal Image Correlation (STIC), face limitations in success rate, image quality, and ease of use, and come at the cost of lower spatial and temporal resolution compared to 2D acquisitions. This work studies the feasibility of aligning real high spatial and temporal resolution 2D fetal echocardiography into a reference 3D space defined by lower resolution 3D STIC. FERN, a $\textbf{F}$etal $\textbf{E}$chocardiography $\textbf{R}$egistration $\textbf{N}$etwork, employs transformers for standard fetal echocardiography view alignment. The network is trained on simulated 2D slices derived from 3D volumes at end-diastole, and validated on real 2D acquisitions from fetuses with Coarctation of the Aorta and Right Aortic Arch diagnoses, achieving a mean Euclidean distance of 2.98 $\pm$ 1.27 mm on cardiac region-of-interest points between predicted and manually selected planes. Compared to manually aligned planes, improved image similarity to an average atlas is achieved, confirmed by blinded best plane selection. This work demonstrates that high spatial and temporal resolution 2D fetal echocardiography can be integrated into a 3D context provided by lower-resolution 3D acquisitions or fetal cardiac atlases, potentially resulting in a new 3D visualization tool for enhanced CHD diagnosis.}
}



% MIDL 2025 paper in PMLR volume 301, pages 515--546.
@inproceedings{pmlr-v301-griebel26a,
  author    = {Griebel, Titus and Archit, Anwai and Pape, Constantin},
  title     = {Segment Anything for Histopathology},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  series    = {Proceedings of Machine Learning Research},
  volume    = {301},
  pages     = {515--546},
  year      = {2026},
  month     = {09--11 Jul},
  publisher = {PMLR},
  url       = {https://proceedings.mlr.press/v301/griebel26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/griebel26a/griebel26a.pdf},
  abstract  = {Nucleus segmentation is an important analysis task in digital pathology. However, methods for automatic segmentation often struggle with new data from a different distribution, requiring users to manually annotate nuclei and retrain data-specific models. Vision foundation models (VFMs), such as the Segment Anything Model (SAM), offer a more robust alternative for automatic and interactive segmentation. Despite their success in natural images, a foundation model for nucleus segmentation in histopathology is still missing. Initial efforts to adapt SAM have shown some success, but did not yet introduce a comprehensive model for diverse segmentation tasks. To close this gap, we introduce PathoSAM, a VFM for nucleus segmentation, based on training SAM on a diverse dataset. Our extensive experiments show that it is the new state-of-the-art model for automatic and interactive nucleus instance segmentation in histopathology. We also demonstrate how it can be adapted for other segmentation tasks, including semantic nucleus segmentation. For this task, we show that it yields results better than popular methods, while not yet beating the state-of-the-art, CellViT. Our models are open-source and compatible with popular tools for data annotation. We also provide scripts for whole-slide image segmentation.}
}



@InProceedings{pmlr-v301-guichemerre26a,
  title = 	 {{PixelCAM}: Pixel Class Activation Mapping for Histology Image Classification and {ROI} Localization},
  author =       {Guichemerre, Alexis and Belharbi, Soufiane and Shateri, Mohammadhadi and McCaffrey, Luke and Granger, Eric},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {547--587},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/guichemerre26a/guichemerre26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/guichemerre26a.html},
  abstract = 	 {Weakly supervised object localization (WSOL) methods allow training models to classify images and localize ROIs. WSOL only requires low-cost image-class annotations yet provides a visually interpretable classifier, which is important in histology image analysis. Standard WSOL methods rely on class activation mapping (CAM) methods to produce spatial localization maps according to a single- or two-step strategy. While both strategies have made significant progress, they still face several limitations with histology images. Single-step methods can easily result in under- or over-activation due to the limited visual ROI saliency in histology images and scarce localization cues. They also face the well-known issue of asynchronous convergence between classification and localization tasks. The two-step approach is sub-optimal because it is constrained to a frozen classifier, limiting the capacity for localization. Moreover, these methods also struggle when applied to out-of-distribution (OOD) datasets. In this paper, a multi-task approach for WSOL is introduced for simultaneous training of both tasks to address the asynchronous convergence problem. In particular, localization is performed in the pixel-feature space of an image encoder that is shared with classification. This allows learning discriminant features and accurate delineation of foreground/background regions to support ROI localization and image classification. We propose PixelCAM, a cost-effective foreground/background pixel-wise classifier in the pixel-feature space that allows for spatial object localization. Using partial-cross entropy, PixelCAM is trained using pixel pseudo-labels collected from a pretrained WSOL model. Both image and pixel-wise classifiers are trained simultaneously using standard gradient descent. In addition, our pixel classifier can easily be integrated into CNN- and transformer-based architectures without any modifications. Our extensive experiments on GlaS and CAMELYON16 cancer datasets show that PixelCAM can improve classification and localization performance when integrated with different WSOL methods. Most importantly, it provides robustness on both tasks for OOD data linked to different cancer types, with large domain shifts between training and testing image data.}
}



@InProceedings{pmlr-v301-bai26b,
  title = 	 {Symmetric Multi-level Gradient-Inverse Consistency Network for Brain Image Registration with Large Deformation},
  author =       {Bai, Haoying and Che, Tongtong and Zhang, Jichang and Li, Shuyu},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {588--603},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/bai26b/bai26b.pdf},
  url = 	 {https://proceedings.mlr.press/v301/bai26b.html},
  abstract = 	 {Accurate and robust deformable image registration is crucial for brain image analysis. While deep learning has significantly advanced this field, existing methods often lack robustness for large deformations due to inter-subject variability, frequently requiring pre-registration and relying heavily on data-driven approaches. To address these limitations, we propose an end-to-end Symmetric Multi-level Gradient-Inverse Consistency Network (SM-GICNet) for accurate and robust brain image registration. SM-GICNet employs 1) a symmetric multi-level framework with an attention gate mechanism to capture complex deformations at multiple scales, 2) a symmetric registration strategy at each level to mitigate directional bias, and 3) a gradient inverse consistency strategy to reduce reliance on data-driven constraints and control deformation field complexity. Experimental results demonstrate that our method is able to eliminate the need for pre-registration and outperforms state-of-the-art methods on large deformation registration tasks, achieving a Dice similarity coefficient of 0.797. The implementation of our SM-GICNet is available online at https://github.com/LSYLAB/SM-GICNet.git.}
}



@InProceedings{pmlr-v301-hays26a,
  title = 	 {An Unsupervised Approach for Artifact Severity Scoring in Multi-Contrast {MR} Images},
  author =       {Hays, Savannah and Zuo, Lianrui and Dewey, Blake E. and Remedios, Samuel and Zhang, Jinwei and Mowry, Ellen M. and Newsome, Scott D. and Carass, Aaron and Prince, Jerry L.},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {604--614},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/hays26a/hays26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/hays26a.html},
  abstract = 	 {Quality assurance (QA) in magnetic resonance (MR) imaging is critical but remains a challenging and time-intensive process, particularly when working with large-scale, multi-site imaging datasets. Manual QA methods are subjective, prone to inter-rater variability, and impractical for high-throughput workflows. Existing automated QA methods often lack generalizability to diverse datasets or fail to provide interpretable insights into the causes of poor image quality. To address these limitations, we introduce an unsupervised and interpretable QA framework for multi-contrast MR images that quantifies artifact severity. By assigning a numerical score to each image, our method enables objective, consistent evaluation of image quality and highlights specific levels of artifact presence that can impair downstream analysis. Our framework employs an unsupervised contrastive learning approach, leveraging simulated artifact transformations, including random bias, noise, anisotropy, and ghosting, to train the model without requiring manual labels or preprocessing. A margin-based contrastive loss further enables differentiation between varying levels of artifact severity. We validate our framework using simulated artifacts on a public dataset and real artifacts on a private clinical dataset, demonstrating its robustness and generalizability for automatic MR image QA. By efficiently evaluating image quality and identifying artifacts prior to data processing, our approach streamlines QA workflows and enhances the reliability of subsequent analyses in both research and clinical settings.}
}



@InProceedings{pmlr-v301-heidrich26a,
  title = 	 {Curriculum Learning for Language-guided, Multi-modal Detection of Various Pathologies},
  author =       {Heidrich, Laurenz Adrian and Rastogi, Aditya and Upadhya, Priyank and Brugnara, Gianluca and Foltyn-Dumitru, Martha and Wiestler, Benedikt and Vollmuth, Philipp},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {615--638},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/heidrich26a/heidrich26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/heidrich26a.html},
  abstract = 	 {Pathology detection in medical imaging is crucial for radiologists, yet current approaches that train specialized models for each region of interest often lack efficiency and robustness. Furthermore, the scarcity of annotated medical data, particularly for diverse phenotypes, poses significant challenges in achieving generalizability. To address these challenges, we present a novel language-guided object detection pipeline that leverages curriculum learning strategies, chosen for their ability to progressively train models on increasingly complex samples, thereby improving generalization across pathologies, phenotypes, and modalities. We developed a unified pipeline to convert segmentation datasets into bounding box annotations, and applied two curriculum learning approaches - teacher curriculum and bounding box size curriculum - to train a Grounding DINO model. Our method was evaluated on different tumor types in MRI and CT scans and showed significant improvements in detection accuracy. The teacher and bounding box size curriculum learning approaches yielded a 4.9\% AP and 5.2\% AP increase over baseline, respectively. The results highlight the potential of curriculum learning to optimize medical image analysis and clinical workflow. The code is available at https://github.com/CCI-Bonn/CL4OD.}
}



@InProceedings{pmlr-v301-heinrich26a,
  title = 	 {Fast forward: Rephrasing {3D} deformable image registration through density alignment and splatting},
  author =       {Heinrich, Mattias P. and Bigalke, Alexander and Hansen, Lasse},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {639--652},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/heinrich26a/heinrich26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/heinrich26a.html},
  abstract = 	 {Unsupervised learning- and optimisation-based 3D registration has almost exclusively been approached using backward warping (interpolation) for transforming images. While this has practical advantages, in particular the ease of implementation within common libraries, it limits the robustness and accuracy in certain challenging scenarios. The alternative solution of forward splatting (extrapolation) is currently limited to very few applications, e.g. mesh or point cloud registration, requiring specific geometric learning architectures that are so far less efficient compared to dense 3D convolutional networks. In this work, we propose to use a straightforward forward splatting technique based on differentiable rasterisation. Contrary to prior work, we rephrase the problem of deformable image registration as a density alignment of rasterised volumes based on intermediate point cloud representations that can be automatically obtained through e.g. geometric vessel filters or surface segmentations. Our experimental validation demonstrates state-of-the-art performance over a wide range of registration tasks including intra- and inter-patient alignment of thorax and abdomen.}
}



@InProceedings{pmlr-v301-heinrich26b,
  title = 	 {{BinaryFormer}: 1-bit self-attention for long-range transformers in medical image segmentation and {3D} diffusion models},
  author =       {Heinrich, Mattias P.},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {653--667},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/heinrich26b/heinrich26b.pdf},
  url = 	 {https://proceedings.mlr.press/v301/heinrich26b.html},
  abstract = 	 {Vision transformers excel at capturing long-range interactions and have become essential for many medical image analysis tasks. Their computational cost, however, grows quadratically with sequence length - which is problematic for certain 3D problems, e.g. high-resolution diffusion models that require dozens of sampling steps. Flash attention addressed some limitations by optimising local memory access, but left the computational burden high. Quantising weights and activations for convolutions and fully binary networks are possible, but have to be trained at higher precision and often resulted in performance drops. For transformers recent studies have been limited to quantising weights in linear layers or exploiting the potential of sparsity in self-attention scores. We present a novel scheme that not only enables a binary precision computation of the self-attention at inference time but also extends this to the training of transformers. To achieve differentiability we combine the bitwise Hamming distance with a learnable scalar query and key weighting. In theory this yields a 16-32x more resource-efficiency in arithmetic operations and memory bandwidth. We evaluate our model on three tasks with sequence lengths of $N>1000$: classification of images without patch-embedding, semantic 2D MRI segmentation and 3D high-resolution diffusion models for inpainting and synthesis. Our results demonstrate competitive performance and we provide an intuitive reasoning for the effectiveness of differentiable key-, query- weighting through Bernoulli sampling and distance interpolation. Code is available at https://github.com/mattiaspaul/BinaryFormer.}
}



@InProceedings{pmlr-v301-hisham26a,
  title = 	 {Family of Deep Image Prior Networks for Accelerated {3D} {LGE-MRI} Acquisition with Enhanced Reconstruction},
  author =       {Hisham, Md Hasibul Husain and Elhabian, Shireen and Adluru, Ganesh and Arai, Andrew and Kholmovski, Eugene and Ranjan, Ravi and Dibella, Edward},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {668--678},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/hisham26a/hisham26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/hisham26a.html},
  abstract = 	 {Late Gadolinium Enhancement (LGE) MRI is essential for visualizing and treating left atrial fibrosis, but current protocols require lengthy acquisition times (7-20 minutes) and often produce suboptimal image quality. While recent advances in isotropic imaging have shown promise, scan times of 12-15 minutes still present clinical challenges. This study evaluates the efficacy of existing Deep Image Prior (DIP) frameworks for accelerated 3D LGE-MRI reconstruction. We comprehensively assess multiple DIP variants - vanilla DIP, reference-guided DIP, DIP with Total Variation, and self-guided DIP - on their ability to reconstruct high-quality isotropic (1.25mm$^3$) images from highly undersampled k-space data. Using data from 10 subjects, we demonstrate that self-guided DIP achieves superior reconstruction quality (PSNR: 32.8$\pm$1.2 dB, SSIM: 0.891$\pm$0.015 at 1/4th of acquisition time) compared to traditional compressed sensing and other DIP variants. Our evaluation shows that DIP-based reconstruction can maintain diagnostic quality with acquisition times reduced to 2-4 minutes, particularly in preserving thin left atrial wall details. These findings suggest that DIP-based methods could improve clinical workflow efficiency and patient comfort in high-resolution 3D LGE studies for atrial fibrillation patients.}
}



@InProceedings{pmlr-v301-jiang26a,
  author    = {Jiang, Jue and Rangnekar, Aneesh and Veeraraghavan, Harini},
  title     = {Co-distilled attention guided masked image modeling with noisy teacher for self-supervised learning on medical images},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  series    = {Proceedings of Machine Learning Research},
  volume    = {301},
  pages     = {679--694},
  publisher = {PMLR},
  year      = {2026},
  month     = {09--11 Jul},
  url       = {https://proceedings.mlr.press/v301/jiang26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/jiang26a/jiang26a.pdf},
  abstract  = {Masked image modeling (MIM) is a highly effective self-supervised learning (SSL) approach to extract useful feature representations from unannotated data. Predominantly used random masking methods make SSL less effective for medical images due to the contextual similarity of neighboring patches, leading to information leakage and SSL simplification. Hence, we propose an attention guided masking mechanism within a co-distillation learning framework to selectively mask semantically co-occurring and discriminative patches, aiming to reduce information leakage and increase the difficulty of SSL pretraining. However, attention guided masking inevitably reduces the diversity of attention heads, which negatively impacts downstream task performance. To address this, we integrate a noisy teacher into the co-distillation framework (termed DAGMaN) to enable attentive masking while preserving high attention head diversity. We demonstrate the capability of DAGMaN on multiple tasks including full- and few-shot lung nodule classification, immunotherapy outcome prediction,  tumor segmentation, and unsupervised clustering of organs.}
}



@InProceedings{pmlr-v301-jin26a,
  title = 	 {Feature Attribution for Deep Learning Models through Total Variance Decomposition},
  author =       {Jin, Yinzhu and Zhu, Shen and Fletcher, Tom},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {695--715},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/jin26a/jin26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/jin26a.html},
  abstract = 	 {This paper introduces a new approach to feature attribution for deep learning models, quantifying the importance of specific features in model decisions. By decomposing the total variance of model decisions into explained and unexplained fractions, conditioned on the target feature, we define the feature attribution score as the proportion of explained variance. This method offers a solid statistical foundation and normalized quantitative results. When ample data is available, we compute the score directly from test data. For scarce data, we use constrained sampling with generative diffusion models to represent the conditional distribution at a given feature value. We demonstrate the method's effectiveness on both a synthetic image dataset with known ground truth and OASIS-3 brain MRIs.}
}



@InProceedings{pmlr-v301-juturu26a,
  title = 	 {Unsupervised Cellular Anomaly Detection in Toxicological Histopathology},
  author =       {Juturu, Saketh and Raipuria, Geetank and Amaravadi, Raghav and Srivastava, Aman and Roy, Malini and Singhal, Nitin},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {716--734},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/juturu26a/juturu26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/juturu26a.html},
  abstract = 	 {Irregularities in cellular representation play a crucial role in assessing drug-induced tissue alterations in toxicological histopathology studies. However, the process of annotating rare abnormal cellular variations for training supervised deep learning models presents significant challenges and lacks scalability. While anomaly detection is well-suited for this purpose, it has not yet been explored for cellular-level analysis. In this study, we evaluate cellular anomaly detection using datasets derived from the kidney and liver tissue of Wistar rats. Our findings show that a KNN-distance-based anomaly detection method significantly benefits from employing a feature extractor that has been pre-trained on extensive unsupervised histopathology datasets. When utilizing the best-performing feature extractor, the KNN-distance method surpasses state-of-the-art anomaly detection models by over 4.84\% (AUC), including the denoising diffusion probabilistic model, in detecting cellular anomalies. Additionally, we assess the effectiveness of this method in identifying variations in anomalous cell counts between control and treated animal tissues within a toxicological study, revealing a statistically significant difference between the two dosage groups.}
}



@InProceedings{pmlr-v301-karanam26a,
  title = 	 {{MORPH-LER}: {Log-Euclidean} Regularization for Population-Aware Image Registration},
  author =       {Karanam, Mokshagna Sai Teja and Iyer, Krithika and Joshi, Sarang and Elhabian, Shireen},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {735--747},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/karanam26a/karanam26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/karanam26a.html},
  abstract = 	 {Spatial transformations that capture population-level morphological statistics are critical for medical image analysis. Commonly used smoothness regularizers for image registration fail to integrate population statistics, leading to anatomically inconsistent transformations. Inverse consistency regularizers promote geometric consistency but lack population morphometrics integration. Regularizers that constrain deformation to low-dimensional manifold methods address this. However, they prioritize reconstruction over interpretability and neglect diffeomorphic properties, such as group composition and inverse consistency. We introduce MORPH-LER, a Log-Euclidean regularization framework for population-aware unsupervised image registration. MORPH-LER learns population morphometrics from spatial transformations to guide and regularize registration networks, ensuring anatomically plausible deformations. It features a bottleneck autoencoder that computes the principal logarithm of deformation fields via iterative square-root predictions. It creates a linearized latent space that respects diffeomorphic properties and enforces inverse consistency. By integrating a registration network with a diffeomorphic autoencoder, MORPH-LER produces smooth, meaningful deformation fields. The framework offers two main contributions: (1) a data-driven regularization strategy that incorporates population-level anatomical statistics to enhance transformation validity and (2) a linearized latent space that enables compact and interpretable deformation fields for efficient population morphometrics analysis. We validate MORPH-LER across two families of deep learning-based registration networks, demonstrating its ability to produce anatomically accurate, computationally efficient, and statistically meaningful transformations on the OASIS-1 brain imaging dataset.}
}



@InProceedings{pmlr-v301-korkmaz26a,
  title = 	 {{I2I-Galip}: Unsupervised Medical Image Translation Using Generative Adversarial {CLIP}},
  author =       {Korkmaz, Yilmaz and Patel, Vishal M.},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {748--762},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/korkmaz26a/korkmaz26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/korkmaz26a.html},
  abstract = 	 {Unpaired image-to-image translation is a challenging task due to the absence of paired examples, which complicates learning the complex mappings between the distinct distributions of the source and target domains. One of the most commonly used approaches for this task is cycle-consistent models which require the training of a new pair of generator-discriminator networks for each translation. In this paper, we propose a new image-to-image translation framework named Image-to-Image-Generative-Adversarial-CLIP (I2I-Galip) where we utilize pre-trained multi-modal foundation models to mitigate the need of separate generator-discriminator pairs for each source-target mapping while achieving better and more efficient multi-domain translation. By utilizing the massive knowledge gathered during pre-training a foundation model, our approach makes use of a single lightweight generator network with $\approx$13M parameters for the multi-domain image translation task. Comprehensive experiments on translation performance in public MRI and CT datasets show the superior performance of the proposed framework over the existing approaches.}
}



@InProceedings{pmlr-v301-kormann26a,
  title = 	 {{HIEGNet}: A Heterogenous Graph Neural Network Including the Immune Environment in Glomeruli Classification},
  author =       {Kormann, Niklas and Ramuz, Masoud and Nisar, Zeeshan and Schaadt, Nadine S. and Annuth, Hendrik and Doerr, Benjamin and Feuerhake, Friedrich and Lampert, Thomas and Lutzeyer, Johannes F.},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {763--786},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/kormann26a/kormann26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/kormann26a.html},
  abstract = 	 {Graph Neural Networks (GNNs) have recently been found to excel in histopathology. However, an important histopathological task, where GNNs have not been extensively explored, is the classification of glomeruli health as an important indicator in nephropathology. This task presents unique difficulties, particularly for the graph construction, i.e., the identification of nodes, edges, and informative features. In this work, we propose a pipeline composed of different traditional and machine learning-based computer vision techniques to identify nodes, edges, and their corresponding features to form a heterogeneous graph. We then proceed to propose a novel heterogeneous GNN architecture for glomeruli classification, called HIEGNet, that integrates both glomeruli and their surrounding immune cells. Hence, HIEGNet is able to consider the immune environment of each glomerulus in its classification. Our HIEGNet was trained and tested on a dataset of Whole Slide Images from kidney transplant patients. Experimental results demonstrate that HIEGNet outperforms several baseline models and generalises best between patients among all baseline models. Our implementation is publicly available at https://github.com/nklsKrmnn/HIEGNet.git.}
}



@InProceedings{pmlr-v301-krishnan26a,
  title = 	 {Anatomy-Guided Multi-Path {CycleGAN} for Lung {CT} Kernel Harmonization},
  author =       {Krishnan, Aravind and Li, Thomas and Remedios, Lucas Walker and Xu, Kaiwen and Zuo, Lianrui and Sandler, Kim L. and Maldonado, Fabien and Landman, Bennett Allan},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {787--802},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/krishnan26a/krishnan26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/krishnan26a.html},
  abstract = 	 {Accurate quantitative measurement in lung computed tomography (CT) imaging often relies on consistent kernel reconstruction across scanners and manufacturers. Harmonization can reduce measurement variability caused by heterogeneous reconstruction kernels; however, harmonization across different manufacturers and scanners remains challenging due to significant differences in reconstruction protocol and positional alignment of subjects, often resulting in anatomical hallucinations. To address this, we propose a multi-path cycleGAN framework that incorporates multi-region anatomical labels and a tissue statistic loss as anatomical regularization to preserve structural integrity during harmonization. We trained our model on 100 scans each of four representative reconstruction kernels from the National Lung Screening Trial (NLST) dataset and evaluated it on 240 withheld scans. Experimental results demonstrate superior performance of our method in both within-manufacturer harmonization and cross-manufacturer harmonization: Harmonizing hard-to-soft kernel images within a single manufacturer significantly reduces emphysema measurement discrepancies ($p < 0.05$). Across manufacturers, harmonizing all kernels to a reference soft kernel yields consistent emphysema quantification ($p > 0.05$) and preserves anatomical structures, as demonstrated by improved Dice similarity coefficient in skeletal muscle and subcutaneous adipose tissue between harmonized and unharmonized images. These findings demonstrate that segmentation-driven anatomical regularization effectively addresses cross-manufacturer discrepancies, ensuring robust quantitative imaging. We release our code and model at https://github.com/MASILab/AnatomyconstrainedMultipathGAN.}
}



@InProceedings{pmlr-v301-kuipers26a,
  title = 	 {Self-Supervised Synthetic Cerebral Vessel Tree Generation using Semantic Signed Distance Fields},
  author =       {Kuipers, Thijs P. and Konduri, Praneeta R. and Bekkers, Erik J and Marquering, Henk},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {803--820},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/kuipers26a/kuipers26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/kuipers26a.html},
  abstract = 	 {Advances in in-silico clinical trials for the development of novel treatment and devices for acute ischemic stroke have driven the creation of synthetic virtual patient populations to address the lack of large real-world datasets. Recent work proposed a method for generating semantic vascular centerline tree of the major cerebral arteries using pointcloud diffusion. However, this approach relies on separate post-processing algorithms to reconstruct the vessel tree topology, which does not generalize well to more topologically complex trees. To overcome this limitation, we introduce semantic signed distance fields for modeling cerebral vessel trees in a fully self-supervised manner. Our approach bypasses the need for separate reconstruction of the tree topology, and can be trained directly on shape-surfaces. Our method combines a variational autoencoder for encoding shapes to robust latent shape representations with a latent-diffusion model for generating synthetic vessel trees. By generating surface geometry directly, our approach eliminates the need for post-processing steps, enabling the generation of high-quality and topologically complex cerebral vessel trees.}
}



@InProceedings{pmlr-v301-kulkarni26a,
  title = 	 {Towards Resource-Efficient Streaming of Large-Scale Medical Image Datasets for Deep Learning},
  author =       {Kulkarni, Pranav and Kanhere, Adway and Siegel, Eliot and Yi, Paul and Parekh, Vishwa Sanjay},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {821--837},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/kulkarni26a/kulkarni26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/kulkarni26a.html},
  abstract = 	 {Large-scale medical imaging datasets have accelerated deep learning (DL) for medical image analysis. However, the large scale of these datasets poses a challenge for researchers, resulting in increased storage and bandwidth requirements for hosting and accessing them. Since different researchers have different use cases and require different resolutions or formats for DL, it is neither feasible to anticipate every researcher's needs nor practical to store data in multiple resolutions and formats. To that end, we propose the Medical Image Streaming Toolkit (MIST), a format-agnostic database that enables streaming of medical images at different resolutions and formats from a single high-resolution copy. We evaluated MIST across eight popular, large-scale medical imaging datasets spanning different body parts, modalities, and formats. Our results showed that our framework reduced the storage and bandwidth requirements for hosting and downloading datasets without impacting image quality. We demonstrate that MIST addresses the challenges posed by large-scale medical imaging datasets by building a data-efficient and format-agnostic database to meet the diverse needs of researchers and reduce barriers to DL research in medical imaging.}
}



@InProceedings{pmlr-v301-kumar26a,
  title = 	 {{PRISM}: High-Resolution \& Precise Counterfactual Medical Image Generation using Language-guided {Stable Diffusion}},
  author =       {Kumar, Amar and Kriz, Anita and Havaei, Mohammad and Arbel, Tal},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {838--863},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/kumar26a/kumar26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/kumar26a.html},
  abstract = 	 {Developing reliable and generalizable deep learning systems for medical imaging faces significant obstacles due to spurious correlations, data imbalances, and limited text annotations in datasets. Addressing these challenges requires architectures robust to the unique complexities posed by medical imaging data. The rapid advancements in vision-language foundation models within the natural image domain prompt the question of how they can be adapted for medical imaging tasks. In this work, we present PRISM, a framework that leverages foundation models to generate high-resolution, language-guided medical image counterfactuals using Stable Diffusion. Our approach demonstrates unprecedented precision in selectively modifying spurious correlations (the medical devices) and disease features, enabling the removal and addition of specific attributes while preserving other image characteristics. Through extensive evaluation, we show how PRISM advances counterfactual generation and enables the development of more robust downstream classifiers for clinically deployable solutions. To facilitate broader adoption and research, we make our code publicly available at https://github.com/Amarkr1/PRISM.}
}



@InProceedings{pmlr-v301-larsen26a,
  title = 	 {Prostate Cancer Detection in Bi-Parametric {MRI} using Zonal Anatomy-Guided {U-Mamba} with Multi-Task Learning},
  author =       {Larsen, Michael S. and Abbas, Syed Farhan and Kiss, Gabriel and Elschot, Mattijs and Bathen, Tone F. and Lindseth, Frank},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {864--881},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/larsen26a/larsen26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/larsen26a.html},
  abstract = 	 {Prostate cancer (PCa) remains a leading cause of cancer-related morbidity, emphasizing the need for accurate and non-invasive diagnostic tools. While deep learning models have advanced PCa detection in magnetic resonance imaging (MRI), they often fail to integrate anatomical knowledge. This study evaluates U-Mamba, a deep learning architecture designed to enhance long-range dependency modeling with linear time complexity, for PCa detection. Furthermore, a multi-task learning (MTL) extension, U-Mamba MTL, is introduced to incorporate prostate zonal anatomy, aligning with clinical diagnostic workflows. The models were assessed using diverse datasets, including the PI-CAI hidden tuning cohort (N=100) and an in-house collected out-of-distribution cohort (N=200). Results demonstrate that U-Mamba achieves state-of-the-art detection performance, while U-Mamba MTL further improves PCa detection through the auxiliary zonal segmentation task. These findings highlight the potential of integrating U-Mamba with anatomical context to improve PCa detection. The code and model weights are available at https://github.com/mokkalokka/U-MambaMTL.}
}



@InProceedings{pmlr-v301-laufer26a,
  title = 	 {Synthetic Data Generated from {CT} Scans for Patient Pose Assessment},
  author =       {Laufer, Manuel and Mairh{\"o}fer, Dominik and Sieren, Malte and Gerdes, Hauke and dos Reis, Fabio Leal and Bischof, Arpad and K{\"a}ster, Thomas and Barth, Erhardt and Barkhausen, J{\"o}rg and Martinetz, Thomas},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {882--895},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/laufer26a/laufer26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/laufer26a.html},
  abstract = 	 {An adequate diagnostic quality of radiographs is essential for reliable diagnoses and treatment planning. The patient's pose during radiography is one of the most important factors determining the diagnostic quality. Since patient positioning is difficult and not standardized, an automated AI-based approach using depth images to automatically assess the patient's pose before the radiograph has been taken would be helpful. Due to regulatory hurdles, however, it is difficult in practice to acquire the required depth images and corresponding radiographs. In this paper, we present a framework that can generate such training data synthetically from Computer Tomography scans. We further show that by pretraining on our generated synthetic dataset consisting of 3077 image pairs of upper ankle joints, the pose assessment of real upper ankle joints can be improved by up to 11 percentage points.}
}



@InProceedings{pmlr-v301-lee26a,
  title = 	 {{LUV-Net}: Multi-Pattern Lung Ultrasound Video Classification through Pattern-Specific Attention with Efficient Temporal Feature Extraction},
  author =       {Lee, Jung Hoon and Kim, Changi and Lee, Jinwoo and Yoon, Si Mong and Lee, Kyung-Eui and Park, Hyun-Jun and Hyung, Kwonhyung and Park, Chang Min},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {896--913},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/lee26a/lee26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/lee26a.html},
  abstract = 	 {Lung ultrasound (LUS) has emerged as a crucial bedside imaging tool for critical care, yet its interpretation remains challenging due to its artifact-based nature and high operator dependency. While deep learning approaches offer promising solutions for LUS pattern analysis, existing methods are limited by their focus on single-pattern recognition or disease-specific classification, and inadequate handling of temporal dynamics in video-based models. We propose LUV-Net (Lung Ultrasound Video Network), a novel deep learning model for multi-label classification of LUS patterns, combining pattern-specific attention mechanisms with temporal feature extraction. Our approach consists of two key modules: a spatial feature extraction module utilizing independent pattern-specific attention mechanisms, and a temporal feature extraction module designed to capture sequential relationships between adjacent frames. The model was evaluated using two distinct datasets: a development set of 341 LUS videos and a temporally separated validation set of 56 videos. Through 5-fold cross-validation, LUV-Net demonstrated superior performance in identifying all four LUS patterns (A-lines, B-lines, consolidation, and pleural effusion) compared to conventional video models, achieving higher AUC scores across patterns. The model's interpretability was validated through visualization of pattern-specific attention regions, providing insights into its decision-making process. The code is publicly available at https://github.com/iamhxxn2/LungUS_Video.}
}



@InProceedings{pmlr-v301-lefkes26a,
  title = 	 {A Balancing Act: Optimizing Classification and Retrieval in Cross-Modal Vision Models},
  author =       {Lefkes, Judith and Grisi, Cl{\'e}ment and Litjens, Geert},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {914--930},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/lefkes26a/lefkes26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/lefkes26a.html},
  abstract = 	 {Despite the promising capabilities of vision-language models (VLMs) in diverse tasks, recent studies reveal that they struggle with the fundamental task of image classification. In this study, we explore leveraging state-of-the-art task-specific classification models as a foundation for VLMs, aiming to preserve strong classification performance. Specifically, we assess the impact of contrastive tuning to enable cross-modal retrieval capabilities on a Vision Transformer (ViT) model trained for multi-label classification on natural images and a Hierarchical Vision Transformer (H-ViT) trained for prostate cancer grading in Whole-Slide Images (WSIs). Our results demonstrate that contrastive fine-tuning creates a clear trade-off: classification accuracy rapidly deteriorates toward zero as vision-text alignment improves. By balancing task-specific and contrastive objectives in the loss function during fine-tuning, we achieve competitive slide-level retrieval performance while maintaining classification accuracy. Our code is available on https://github.com/DIAGNijmegen/tradeoff_classification_alignment.git.}
}



@inproceedings{pmlr-v301-leger26a,
  author    = {Leger, Louis-Alexandre and Leonardi, Maxine and Salati, Andrea and Naef, Felix and Weigert, Martin},
  title     = {Sequence models for continuous cell cycle stage prediction from brightfield images},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  series    = {Proceedings of Machine Learning Research},
  volume    = {301},
  pages     = {931--952},
  year      = {2026},
  month     = {09--11 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/leger26a/leger26a.pdf},
  url       = {https://proceedings.mlr.press/v301/leger26a.html},
  abstract  = {Understanding cell cycle dynamics is crucial for studying biological processes such as growth, development and disease progression. While fluorescent protein reporters like the Fucci system allow live monitoring of cell cycle phases, they require genetic engineering and occupy additional fluorescence channels, limiting broader applicability in complex experiments. In this study, we conduct a comprehensive evaluation of deep learning methods for predicting continuous Fucci signals using non-fluorescence brightfield imaging, a widely available label-free modality. To that end, we generated a large dataset of 1.3 M images of dividing RPE1 cells with full cell cycle trajectories to quantitatively compare the predictive performance of distinct model categories including single time-frame models, causal state space models and bidirectional transformer models. We show that both causal and transformer-based models significantly outperform single- and fixed frame approaches, enabling the prediction of visually imperceptible transitions like G1/S within 1h resolution. Our findings underscore the importance of sequence models for accurate predictions of cell cycle dynamics and highlight their potential for label-free imaging.}
}



@InProceedings{pmlr-v301-lennartz26a,
  title = 	 {Adversarial Perturbations Improve Generalization of Confidence Prediction in Medical Image Segmentation},
  author =       {Lennartz, Jonathan and Schultz, Thomas},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {953--964},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/lennartz26a/lennartz26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/lennartz26a.html},
  abstract = 	 {Trustworthy methods for medical image segmentation should come with a reliable mechanism to estimate the quality of their results. Training a separate component for confidence prediction is relatively fast, and can easily be adapted to different quality metrics. However, the resulting estimates are usually not sufficiently reliable under domain shifts, for example when images are taken with different devices. We introduce a novel adversarial strategy for training confidence predictors for the widely used U-Net architecture that greatly improves such generalization. It is based on creating adversarial image perturbations, aimed at substantially decreasing segmentation quality, via the gradients of the confidence predictor, leading to images outside of the original training distribution. We observe that these perturbations initially have little effect on segmentation quality. However, including them in the training gradually improves the confidence predictor's understanding of what actually affects segmentation quality when moving outside of the training distribution. On two different medical image segmentation tasks, we demonstrate that this strategy substantially improves estimates of volumetric and surface Dice on out-of-distribution images.}
}



@InProceedings{pmlr-v301-leshem26a,
  title = 	 {Mapping Functional Language Areas with non-Functional Brain {MRI}},
  author =       {Leshem, Omri and Bick, Atira Sara and Kiryati, Nahum and Levin, Netta and Mayer, Arnaldo},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {965--977},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/leshem26a/leshem26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/leshem26a.html},
  abstract = 	 {Mapping eloquent brain areas has become a standard of care in brain surgery. Current imaging-based techniques usually rely on functional MRI (fMRI), which measures neural activity via the blood oxygenation level-dependent signal. fMRI protocols are time-intensive, require active patient collaboration, and involve laborious manual post-processing and expertise, making them difficult to implement in some clinical scenarios. In this research, we propose a fully automated deep neural pipeline for the mapping of Broca and Wernicke functional language areas using multiple non-functional MRI modalities. The proposed method is evaluated on a cohort of 30 drug-resistant epilepsy patients, showing encouraging qualitative and quantitative results and suggesting its potential applicability as an effective and practical tool for neurosurgical planning and navigation. Implementation details can be found in our GitHub.}
}



@InProceedings{pmlr-v301-li26a,
  title = 	 {Causal {PETS}: Causality-Informed {PET} Synthesis from Multi-modal Data},
  author =       {Li, Yujia and Li, Han and Zhou, S Kevin},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {978--993},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/li26a/li26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/li26a.html},
  abstract = 	 {The synthesis of medical images is particularly important when certain modality data are difficult to obtain, for example, Positron emission tomography (PET). PET is crucial for diagnosing and monitoring neurological disorders. However, the availability is limited due to factors such as high costs, radiation exposure risks, and other constraints. In this study, we propose Causal PETS, a novel causality-informed synthesis model for synthesizing PET images from multi-modal data including MRI, demographic information, and cerebrospinal fluid (CSF) biomarkers. Unlike conventional approaches that involve a straightforward conversion from T1 to PET, our model analyzes the causality between different modality data and seamlessly integrates such causality into PET image generation. Through comprehensive evaluations, we demonstrate that our Causal PETS model outperforms existing non-causal methods in terms of image clarity and accuracy, particularly in identifying regions of interest critical for neurological disorders such as Alzheimer’s Disease (AD). This work underscores the importance of causal reasoning in medical image synthesis and highlights the potential of multimodal integration to advance clinical decision making.}
}



@InProceedings{pmlr-v301-li26b,
  title = 	 {{PCA-YOLO}: A Small Liver Tumor Detection Model with Patch-Contrastive Attention},
  author =       {Li, Xueyang and Xiao, Han and Weng, Zongpeng and Hu, Xinrong and Chen, Danny and Shi, Yiyu},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {994--1007},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/li26b/li26b.pdf},
  url = 	 {https://proceedings.mlr.press/v301/li26b.html},
  abstract = 	 {Liver tumors, as one of the most common malignant tumor types, represent a significant clinical challenge, with the detection of small tumors being particularly problematic. Despite the rapid advances in deep learning (DL) offering significant support in reducing the workload of radiologists, current detection models still struggle with the detection of small tumors. This is particularly troubling as these are the cases where even experienced radiologists are more prone to errors, underscoring the critical need for improved accuracy of detection methods in this area. Addressing this critical gap, this article introduces patch-contrastive attention YOLO (PCA-YOLO), an innovative adaptation of the YOLO framework, incorporating a patch-based attention module to specifically target the detection of small liver tumors. Furthermore, we collected a specialized CT dataset focusing exclusively on small liver tumors, complemented with meticulously annotated bounding boxes, to facilitate this study. Our experimental findings demonstrate that our approach achieves a leading mean Average Precision (mAP) score of 77.2% at a 50% Intersection Over Union (IoU) threshold, surpassing all current leading detection methods tested against our specialized dataset.}
}



@inproceedings{pmlr-v301-li26c,
  author    = {Li, Thomas and Zuo, Lianrui and Liu, Yihao and Krishnan, Aravind and Sandler, Kim L. and Lasko, Thomas A and Maldonado, Fabien and Landman, Bennett Allan},
  title     = {Contrastive Patient-level Pretraining Enables Longitudinal and Multimodal Fusion for Lung Cancer Risk Prediction},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  series    = {Proceedings of Machine Learning Research},
  volume    = {301},
  pages     = {1008--1020},
  year      = {2026},
  month     = {09--11 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/li26c/li26c.pdf},
  url       = {https://proceedings.mlr.press/v301/li26c.html},
  abstract  = {Leveraging longitudinal and multimodal data is important for clinical predictive tasks. Contrastive language-image pretraining (CLIP) has been successful in learning multimodal representations by aligning paired images and captions, i.e. medical images and corresponding radiology report. However, in real clinical settings, the alignment of unpaired modalities, such as medical images and clinical notes collected at different times, is an open challenge, even though such data are ubiquitous in practice. This study conducts contrastive pretraining between longitudinal chest CTs and clinical variables on the patient level using a large public lung cancer screening dataset. Leveraging a time-distanced transformer to encode longitudinal imaging and an open-source text embedding to encode clinical variables, we optimize contrastive loss between the embedded modalities from same patient (positive pair) against those from different patients (negative pair). We find that finetuning the CLIP representation significantly improves prediction of lung cancer risk in two types of clinical populations (0.895 and 0.893 AUC) compared to conventional multimodal fusion (0.873 and 0.875 AUC) and single modality baselines. These results demonstrate how contrastive patient-level pretraining can enable longitudinal and multimodal fusion without additional training data. We released our code and pre-trained weights at https://github.com/MASILab/lung-cplp.}
}



@InProceedings{pmlr-v301-li26d,
  title = 	 {Scaling Supervision for Free: Leveraging Universal Segmentation Models for Enhanced Medical Image Diagnosis},
  author =       {Li, Yingtai and Ming, Shuai and Lai, Haoran and Tang, Fenghe and Wei, Wei and Zhou, S Kevin},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1021--1040},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/li26d/li26d.pdf},
  url = 	 {https://proceedings.mlr.press/v301/li26d.html},
  abstract = 	 {Deep learning-based medical image analysis has been constrained by the limited availability of large-scale annotated data. While recent advances in large language models have enabled scaling automatic extraction of diagnostic labels from reports, we propose that scaling other form of supervision could be an equally important yet unexplored direction. Inspired by the success of foundation models, we leverage modern universal segmentation model to scale anatomical segmentation as an additional supervision signal during training. Through extensive experiments on three large-scale CT datasets totaling 58K+ volumes, we demonstrate that incorporating this ``free'' anatomical supervision consistently improves the performance of various mainstream architectures (ResNet, ViT, and Swin Transformer) by up to 12.74%, with particularly significant gains for Transformer-based models and anatomically-localized abnormalities, while maintaining inference efficiency as the segmentation branch is only used during training. This work opens up new direction for scaling in medical imaging and demonstrates how existing universal segmentation models can be repurposed to enhance diagnostic models at virtually no additional cost.}
}



@inproceedings{pmlr-v301-liu26a,
  author    = {Liu, Shih-Wen and Fan, Hsuan-Yu and Chu, Wei-Ta and Yang, Fu-En and Wang, Yu-Chiang Frank},
  title     = {Histopathology Image Report Generation by Vision Language Model with Multimodal In-Context Learning},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  series    = {Proceedings of Machine Learning Research},
  volume    = {301},
  pages     = {1041--1052},
  year      = {2026},
  month     = {09--11 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/liu26a/liu26a.pdf},
  url       = {https://proceedings.mlr.press/v301/liu26a.html},
  abstract  = {Automating medical report generation from histopathology images is a critical challenge requiring effective visual representations and domain-specific knowledge. Inspired by the common practices of human experts, we propose an in-context learning framework called PathGenIC that integrates context derived from the training set with a multimodal in-context learning (ICL) mechanism. Our method dynamically retrieves semantically similar whole slide image (WSI)-report pairs and incorporates adaptive feedback to enhance contextual relevance and generation quality. Evaluated on the HistGen benchmark, the framework achieves state-of-the-art results, with significant improvements across BLEU, METEOR, and ROUGE-L metrics, and demonstrates robustness across diverse report lengths and disease categories. By maximizing training data utility and bridging vision and language with ICL, our work offers a solution for AI-driven histopathology reporting, setting a strong foundation for future advancements in multimodal clinical applications.}
}



@InProceedings{pmlr-v301-magg26a,
  title = 	 {Zero-shot capability of {2D} {SAM}-family models for bone segmentation in {CT} scans},
  author =       {Magg, Caroline and S{\'a}nchez, Clara I. and Kervadec, Hoel},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1053--1073},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/magg26a/magg26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/magg26a.html},
  abstract = 	 {The Segment Anything Model (SAM) and similar models build a family of promptable foundation models (FMs) for image and video segmentation. The object of interest is identified using prompts—user provided input such as bounding boxes or points—and the models have shown very promising results when it comes to generalization to new tasks. However, extensive evaluation studies are required for medical applications, to assess their strengths and weaknesses in clinical settings. As the performance of those models is highly dependent on the quality and quantity of their prompts, it is necessary to thoroughly benchmark the different options. Currently, no dedicated evaluation studies exist specifically for bone segmentation in CT scans. Leveraging high-quality private and public datasets on four skeletal regions, we test the zero-shot capabilities of SAM-family models for bone CT segmentation, using non-interactive prompting strategies, composed of bounding box, points and combinations of the two. Additionally, we design a guideline for informed decision-making in 2D non-interactive prompting based on our insights on segmentation performance and inference time. Our results show that SAM and SAM2 currently outperform medically fine-tuned FMs, and prompted with a bounding box together with a center point have the best performance across all tested settings. Our code is available in this github repository (https://github.com/CarolineMagg/SAM-family-2D-benchmark).}
}



@InProceedings{pmlr-v301-al-mahrooqi26a,
  title = 	 {Empirical Analysis of Scaling Vision Foundation Models for Chest X-rays},
  author =       {Al Mahrooqi, Ahmed and Munjal, Prateek and Rajan, Ronnie and Pimentel, Marco A. F. and Kanithi, Praveenkumar},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1074--1094},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/al-mahrooqi26a/al-mahrooqi26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/al-mahrooqi26a.html},
  abstract = 	 {Recent advancements in multimodal transformers have shown remarkable success in computer vision and natural language tasks, yet their adaptation to the clinical world remains challenging. We introduce CXformer, a vision transformer adapted for chest X-ray analysis, through systematic investigation of architectural choices and training modifications from DINOv2. Our empirical results show that using registers in ViT training, centering the teacher model's softmax outputs, and optimizing the number of heads leads to better performance. The small version of CXformer(S) (22M parameters) achieves 83.28% mean AUROC on CheXpert test set, surpassing the baseline of 80.46% achieved with vanilla DINOv2 settings. Contrary to common assumptions, our larger model CXformer(B) with 87M parameters shows similar performance at 84% mean AUROC on CheXpert, suggesting that training optimizations matter more than model size. Furthermore compared to the current state-of-the-art RAD-DINO, our CXformer(B), with 46% reduced pretraining compute (in FLOPs) achieves an average AUROC of 87.93% (vs 87.32% by RAD-DINO) on pathology image classification task evaluated across three widely used CXR datasets i.e. CheXpert, RSNA Pneumonia, and NIH CXR8. Beyond classification, CXformer also delivers competitive, and occasionally superior, performance in semantic segmentation and radiology report generation, underscoring its versatility. CXformer base and small models can be found at https://huggingface.co/m42-health}
}



@InProceedings{pmlr-v301-marhuenda26a,
  author    = {Marhuenda, Luis-Jesus and Obrador-Reina, Miquel and Aas-Alas, Mohamed and Albiol, Alberto and Paredes, Roberto},
  title     = {Unveiling Differences: A Vision Encoder-Decoder Model for Difference Medical Visual Question Answering},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  series    = {Proceedings of Machine Learning Research},
  volume    = 301,
  pages     = {1095--1106},
  year      = 2026,
  month     = {09--11 Jul},
  publisher = {PMLR},
  url       = {https://proceedings.mlr.press/v301/marhuenda26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/marhuenda26a/marhuenda26a.pdf},
  abstract  = {Difference Medical Visual Question Answering (Diff-VQA), a specialized subfield of Medical VQA, tackles the critical task of identifying and describing differences between pairs of medical images. This study introduces a novel Vision Encoder-Decoder (VED) architecture tailored for this task, focusing on the comparison of chest X-ray images to detect and explain changes. The proposed model incorporates two key innovations: (1) a light-weight Transformer text decoder architecture capable of generating precise and contextually relevant answers to complex medical questions, and (2) an enhanced fusion mechanism that improves the model’s ability to distinguish between two input images, enabling more accurate comparison of radiological findings. Our approach excels in identifying significant changes, such as pneumonia and lung opacity, demonstrating its utility in automating preliminary radiological assessments. By leveraging large-scale, domain-specific datasets and employing advanced training strategies, our VED architecture achieves state-of-the-art performance on standard VQA metrics, setting a new benchmark in diagnostic accuracy. These advancements highlight the potential of Diff-VQA to enhance clinical workflows and support radiologists in making more precise, informed decisions.}
}



@InProceedings{pmlr-v301-mazher26a,
  title = 	 {Advancing Medical Image Segmentation with Self-Supervised Learning: A 3D Student-Teacher Approach for Cardiac and Neurological Imaging},
  author =       {Mazher, Moona and Alexander, Daniel C. and Qayyum, Abdul and Niederer, Steven A.},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1107--1126},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/mazher26a/mazher26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/mazher26a.html},
  abstract = 	 {We propose 3D-SegSync, a self-supervised learning (SSL) framework designed to improve segmentation accuracy for both cardiac and neurological structures. It integrates a student-teacher model with a 3D Vision-LSTM (xLSTM) backbone to capture spatial dependencies in volumetric data. The SSL phase utilizes large-scale unlabeled datasets for pretraining, followed by fine-tuning on labeled data to improve segmentation across CT and MRI scans. Experimental results demonstrate that 3D-SegSync achieves consistent performance across different anatomical structures. Additionally, its ability to generalize between CT and MRI without requiring modality-specific modifications highlights its adaptability for cardiac and neurological image segmentation. Given its strong performance, 3D-SegSync has the potential to be extended to other medical image segmentation tasks in the future. Code can be found here: https://github.com/Moona-Mazher/3D-SegSync_SSL.}
}



@InProceedings{pmlr-v301-mohammed26a,
  author    = {Mohammed, Abdurahman Ali and Tavanapong, Wallapak and Fonder, Catherine and Sakaguchi, Donald},
  title     = {CountXplain: Interpretable Cell Counting with Prototype-Based Density Map Estimation},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  series    = {Proceedings of Machine Learning Research},
  volume    = 301,
  pages     = {1127--1144},
  year      = 2026,
  month     = {09--11 Jul},
  publisher = {PMLR},
  url       = {https://proceedings.mlr.press/v301/mohammed26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/mohammed26a/mohammed26a.pdf},
  abstract  = {Cell counting in biomedical imaging is pivotal for various clinical applications, yet the interpretability of deep learning models in this domain remains a significant challenge. We propose a novel prototype-based method for interpretable cell counting via density map estimation. Our approach integrates a prototype layer into the density estimation network, enabling the model to learn representative visual patterns for both cells and background artifacts. The learned prototypes were evaluated through a survey of biologists, who confirmed the relevance of the visual patterns identified, further validating the interpretability of the model. By generating interpretations that highlight regions in the input image most similar to each prototype, our method offers a clear understanding of how the model identifies and counts cells. Extensive experiments on two public datasets demonstrate that our method achieves interpretability without compromising counting effectiveness. This work provides researchers and clinicians with a transparent and reliable tool for cell counting, potentially increasing trust and accelerating the adoption of deep learning in critical biomedical applications. Code is available at https://github.com/NRT-D4/CountXplain.}
}



@InProceedings{pmlr-v301-monzon26a,
  title = 	 {Enhancing Low Back Pain Assessment with Diffusion Models for Lumbar Spine MRI Segmentation},
  author =       {Monzon, Maria and Iff, Thomas and Konukoglu, Ender and Jutzeler, Catherine R.},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1145--1163},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/monzon26a/monzon26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/monzon26a.html},
  abstract = 	 {This study introduces a diffusion-based framework for robust and accurate semantic segmentation of lumbar spine MRI scans from patients with low back pain (LBP), regardless of whether the scans are T1- or T2-weighted. We compared with advanced models for segmenting vertebrae, intervertebral discs (IVDs), and spinal canal using the SPIDER dataset. The results showed that SpineSegDiff achieved a segmentation performance comparable to that of the state-of-the-art non-diffusion nnUnet, particularly in improving the identification of degenerated IVDs. In addition, the uncertainty maps generated by our model provide valuable insights for clinical review, enhancing the robustness and reliability of the segmentation results. The potential of diffusion models to enhance the diagnosis and management of LBP through more precise analysis of pathological spine MRI is underscored by our findings.}
}



@InProceedings{pmlr-v301-morao26a,
  title = 	 {Data Augmentation for Medical Imaging: Counterfactual Simulation of Acquisition Parameters via Conditional Diffusion Model},
  author =       {Mor{\~a}o, Pedro A. and Forghani, Yasna and Lou{\c{c}}{\~a}, Nuno and Gouveia, Pedro and Figueiredo, Mario A. T. and Santinha, Joao},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1164--1180},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/morao26a/morao26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/morao26a.html},
  abstract = 	 {Deep learning (DL) models in medical imaging face challenges in generalizability and robustness due to variations in image acquisition parameters (IAP). In this work, we introduce a novel method using conditional denoising diffusion generative models (cDDGMs) to generate counterfactual medical images that simulate different IAP without altering patient anatomy. We demonstrate that using these counterfactual images for magnetic resonance (MR) data augmentation can improve segmentation accuracy in out-of-distribution settings, enhancing the overall generalizability and robustness of DL models across diverse imaging conditions. Our approach shows promise in addressing domain and covariate shifts in medical imaging. The code is publicly available at https://github.com/pedromorao/Counterfactual-MRI-Data-Augmentation}
}



@InProceedings{pmlr-v301-mostafa26a,
  title = 	 {Surgical Flow Masked Autoencoder for Event Recognition},
  author =       {Mostafa, Mayar Lotfy and Alperovich, Anna and Fedotov, Dmitrii and Ghazaei, Ghazal and Saur, Stefan and Farshad, Azade and Navab, Nassir},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1181--1195},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/mostafa26a/mostafa26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/mostafa26a.html},
  abstract = 	 {Recognition and forecasting of surgical events from video sequences are crucial for advancing computer-assisted surgery. Surgical events are often characterized by specific tool-tissue interactions; for example, ``bleeding damage'' occurs when a tool unintentionally cuts a tissue, leading to blood flow. Despite progress in general event classification, recognizing and forecasting events in medical contexts remains challenging due to data scarcity and the complexity of these events. To address these challenges, we propose a method utilizing video masked autoencoders (VideoMAE) for surgical event recognition. This approach focuses the network on the most informative areas of the video while minimizing the need for extensive annotations. We introduce a novel mask sampling technique based on an estimated prior probability map derived from optical flow. We hypothesize that leveraging prior knowledge of tool-tissue interactions will enable the network to concentrate on the most relevant regions in the video. We propose two methods for estimating the prior probability map: (a) retaining areas with the fastest motion and (b) incorporating an additional encoding pathway for optical flow. Our extensive experiments on the public dataset CATARACTS and our in-house neurosurgical data demonstrate that optical flow-based masking consistently outperforms random masking strategies of VideoMAE in phase and event classification tasks. We find that an optical flow encoder enhances classification accuracy by directing the network's focus to the most relevant information, even in regions without rapid motion. Finally, we investigate sequential and multi-task training strategies to identify the best-performing model, which surpasses the current state-of-the-art by 5% on the CATARACTS dataset and 27% on our in-house neurosurgical data.}
}



@InProceedings{pmlr-v301-mouadden26a,
  title = 	 {Biologically-Constrained Multi-Label Classification with Learnable Domain Knowledge},
  author =       {Mouadden, Nabil and Verg{\'e}, V{\'e}ronique and Arbab, Ahmadreza and Micol, Jean-Baptiste and Bernard, Elsa and Renneville, Aline and Christodoulidis, Stergios and Vakalopoulou, Maria},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1196--1212},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/mouadden26a/mouadden26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/mouadden26a.html},
  abstract = 	 {Although recent foundation models trained in a self-supervised setting have shown promise in cellular image analysis, they often produce biologically impossible predictions when handling multiple concurrent abnormalities. This is a problem, as the biological information that may be needed for the different clinical-oriented problems is not directly presented in the images. In this study, we present a novel and modular approach to enforce biological constraints in multi-label medical imaging classification. Building on the powerful and rich representations of the DinoBloom hematological foundation model, our method combines learnable constraint matrices with adaptive thresholding, effectively preventing contradictory predictions while maintaining high sensitivity. Extensive experiments on three datasets, two public and one in-house on neutrophil classification, demonstrate significant improvements over different foundation models and the state-of-the-art methods. Through detailed ablation studies and hyperparameter interpretation, we show that our approach successfully captures biological relationships between different abnormalities.}
}



@InProceedings{pmlr-v301-nguyen26a,
  author    = {Nguyen, Thanh-Huy and Nguyen, Thien and Nguyen, Xuan Bach and Vu, Nguyen Lan Vi and Dinh, Vinh Quang and Meriaudeau, Fabrice},
  title     = {Semi-Supervised Skin Lesion Segmentation under Dual Mask Ensemble with Feature Discrepancy Co-Training},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  series    = {Proceedings of Machine Learning Research},
  volume    = 301,
  pages     = {1213--1226},
  year      = 2026,
  month     = {09--11 Jul},
  publisher = {PMLR},
  url       = {https://proceedings.mlr.press/v301/nguyen26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/nguyen26a/nguyen26a.pdf},
  abstract  = {Skin Lesion Segmentation with supportive Deep Learning has become essential in skin lesion analysis and skin cancer diagnosis. However, in the practical scenario of clinical implementation, there is a limitation in human-annotated labels for training data, which leads to poor performance in supervised training models. In this paper, we propose Dual Mask Ensemble (DME) based on a dual-branch co-training network, which aims to enforce two models to exploit information from different views. Specifically, we introduce a novel feature discrepancy loss trained with a cross-pseudo supervision strategy, which enhances model representation by encouraging the sub-networks to learn from distinct features, thereby mitigating feature collapse. Additionally, Dual Mask Ensemble training enables the sub-models to extract more meaningful information from unlabeled data by combining mask predictions. Experimental results demonstrate the effectiveness of our approach, achieving state-of-the-art performance across several metrics (Dice and Jaccard) on the ISIC2018 and HAM10000 datasets. Our code is available at https://github.com/antares0811/DME-FD.}
}



@InProceedings{pmlr-v301-nizhar26a,
  author    = {Nizhar, Rotem and Frenkel, Lior and Goldberger, Jacob},
  title     = {Clinical Measurements with Calibrated Instance-Dependent Confidence Interval},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  series    = {Proceedings of Machine Learning Research},
  volume    = 301,
  pages     = {1227--1237},
  year      = 2026,
  month     = {09--11 Jul},
  publisher = {PMLR},
  url       = {https://proceedings.mlr.press/v301/nizhar26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/nizhar26a/nizhar26a.pdf},
  abstract  = {Reporting meaningful confidence intervals for the predictions of a regression neural network is critical in medical imaging applications since clinical decisions are based on network predictions. We expect to obtain larger intervals for difficult examples and smaller ones for easier examples to predict. A recently proposed calibration procedure suggests predicting the mean and the variance and scaling the variance on a validation set. Another calibration approach is based on applying conformal prediction to quantile regression. We show that assuming a Gaussian distribution to predict the variance followed by a non-parametric Conformal Prediction technique to scale the estimated variance is the most effective way of achieving a small confidence interval with a coverage guarantee. We report extensive experimental results on various medical imaging datasets and network architectures.}
}



@InProceedings{pmlr-v301-nutzel26a,
  title = 	 {Generate to Ground: Multimodal Text Conditioning Boosts Phrase Grounding in Medical Vision-Language Models},
  author =       {N{\"u}tzel, Felix and Dombrowski, Mischa and Kainz, Bernhard},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1238--1257},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/nutzel26a/nutzel26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/nutzel26a.html},
  abstract = 	 {Phrase grounding, *i.e.*, mapping natural language phrases to specific image regions, holds significant potential for disease localization in medical imaging through clinical reports. While current state-of-the-art methods rely on discriminative, self-supervised contrastive models, we demonstrate that generative text-to-image diffusion models, leveraging cross-attention maps, can achieve superior zero-shot phrase grounding performance. Contrary to prior assumptions, we show that fine-tuning diffusion models with a frozen, domain-specific language model, such as CXR-BERT, substantially outperforms domain-agnostic counterparts. This setup achieves remarkable improvements, with mIoU scores doubling those of current discriminative methods. These findings highlight the underexplored potential of generative models for phrase grounding tasks. To further enhance performance, we introduce Bimodal Bias Merging (BBM), a novel post-processing technique that aligns text and image biases to identify regions of high certainty. BBM refines cross-attention maps, achieving even greater localization accuracy. Our results establish generative approaches as a more effective paradigm for phrase grounding in the medical imaging domain, paving the way for more robust and interpretable applications in clinical practice. The source code and model weights are available at https://github.com/Felix-012/generate_to_ground.}
}



@InProceedings{pmlr-v301-orbes26a,
  author    = {Orbes, Mauricio and Lucena, Oeslle and Ourselin, Sebastien and Cardoso, M. Jorge},
  title     = {MAIS: Memory-Attention for Interactive Segmentation},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  series    = {Proceedings of Machine Learning Research},
  volume    = 301,
  pages     = {1258--1272},
  year      = 2026,
  month     = {09--11 Jul},
  publisher = {PMLR},
  url       = {https://proceedings.mlr.press/v301/orbes26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/orbes26a/orbes26a.pdf},
  abstract  = {Interactive medical segmentation reduces annotation effort by refining predictions through user feedback. Vision Transformer (ViT)-based models, such as the Segment Anything Model (SAM), achieve state-of-the-art performance using user clicks and prior masks as prompts. However, existing methods treat interactions as independent events, leading to redundant corrections and limited refinement gains. We address this by introducing MAIS, a Memory-Attention mechanism for Interactive Segmentation that stores past user inputs and segmentation states, enabling temporal context integration. Our approach enhances ViT-based segmentation across diverse imaging modalities, achieving more efficient and accurate refinements.}
}



@InProceedings{pmlr-v301-ouyang26a,
  title = 	 {DeFusion: An Effective Decoupling Fusion Network for Multi-Modal Pregnancy Prediction},
  author =       {Ouyang, Xueqiang and Wei, Jia and Huo, Wenjie and Wang, Xiaocong and Li, Rui and Zhou, Jianlong},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1273--1293},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/ouyang26a/ouyang26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/ouyang26a.html},
  abstract = 	 {Temporal embryo images and parental fertility table indicators are both valuable for pregnancy prediction in in vitro fertilization embryo transfer (IVF-ET). However, current machine learning models cannot make full use of the complementary information between the two modalities to improve pregnancy prediction performance. In this paper, we propose a Decoupling Fusion Network called DeFusion to effectively integrate the multi-modal information for IVF-ET pregnancy prediction. Specifically, we propose a decoupling fusion module that decouples the information from the different modalities into related and unrelated information, thereby achieving a more delicate fusion. And we fuse temporal embryo images with a spatial-temporal position encoding, and extract fertility table indicator information with a table transformer. To evaluate the effectiveness of our model, we use a new dataset including 4046 cases collected from Southern Medical University. The experiments show that our model outperforms state-of-the-art methods. Meanwhile, the performance on the eye disease prediction dataset reflects the model's good generalization. Our code and dataset are available at https://github.com/Ou-Young-1999/DFNet.}
}



@InProceedings{pmlr-v301-pellegrini26a,
  title = 	 {RaDialog: Large Vision-Language Models for X-Ray Reporting and Dialog-Driven Assistance},
  author =       {Pellegrini, Chantal and {\"O}zsoy, Ege and Busam, Benjamin and Wiestler, Benedikt and Navab, Nassir and Keicher, Matthias},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1294--1312},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/pellegrini26a/pellegrini26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/pellegrini26a.html},
  abstract = 	 {Conversational AI tools for generating and discussing accurate radiology reports could transform radiology by enabling collaborative, human-in-the-loop diagnostic processes, saving time and enhancing report quality. While, to this end, Large Vision-Language Models hold promise, current methods lack clinical correctness or are single-task models without conversational abilities. We propose a novel architecture and dataset to address these limitations. First, we propose a secondary image branch, explicitly focusing on structured clinical findings, improving the clinical correctness score by 13.3%. Second, we propose a catastrophic forgetting mitigation strategy and instruct dataset with variable dialog-based tasks, to enable our model to handle a multitude of different queries. RaDialog marks a foundational step toward clinical dialog systems, outperforming existing medical LVLMs by 15.0% in clinical correctness in report generation, 23.4% in interactive report correction, and is preferred by radiologists in 84.0% of cases over a comparative method. Our model and dataset are publicly available (https://github.com/ChantalMP/RaDialog and https://physionet.org/content/radialog-instruct-dataset/1.1.0/).}
}



@InProceedings{pmlr-v301-di-piazza26a,
  title = 	 {Imitating Radiological Scrolling: A Global-Local Attention Model for 3D Chest CT Volumes Multi-Label Anomaly Classification},
  author =       {Di Piazza, Th{\'e}o and Lazarus, Carole and Nempont, Olivier and Boussel, Loic},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1313--1325},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/di-piazza26a/di-piazza26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/di-piazza26a.html},
  abstract = 	 {The rapid increase in the number of Computed Tomography (CT) scan examinations has created an urgent need for automated tools, such as organ segmentation, anomaly classification, and report generation, to assist radiologists with their growing workload.  Multi-label classification of Three-Dimensional (3D) CT scans is a challenging task due to the volumetric nature of the data and the variety of anomalies to be detected. Existing deep learning methods based on Convolutional Neural Networks (CNNs) struggle to capture long-range dependencies effectively, while Vision Transformers require extensive pre-training, posing challenges for practical use. Additionally, these existing methods do not explicitly model the radiologist's navigational behavior while scrolling through CT scan slices, which requires both global context understanding and local detail awareness. In this study, we present CT-Scroll, a novel global-local attention model specifically designed to emulate the scrolling behavior of radiologists during the analysis of 3D CT scans. Our approach is evaluated on two public datasets, demonstrating its efficacy through comprehensive experiments and an ablation study that highlights the contribution of each model component.}
}



@InProceedings{pmlr-v301-poudel26a,
  author    = {Poudel, Arpan and Shrestha, Mamata and Wang, Nian and Nakarmi, Ukash},
  title     = {SRMRI: A Diffusion-Based Super-resolution Framework and Open Dataset for Blind MRI Super-Resolution},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  series    = {Proceedings of Machine Learning Research},
  volume    = 301,
  pages     = {1326--1341},
  year      = 2026,
  month     = {09--11 Jul},
  publisher = {PMLR},
  url       = {https://proceedings.mlr.press/v301/poudel26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/poudel26a/poudel26a.pdf},
  abstract  = {Existing deep learning methods for medical image super-resolution (SR) often rely on paired datasets generated by simulating low-resolution (LR) images from corresponding high-resolution (HR) scans, which can introduce biases and degrade real-world performance. To overcome these limitations, we present an unsupervised approach based on a score-based diffusion model that does not require paired training data. We train a score-based diffusion model using denoising score matching on HR Magnetic Resonance Imaging (MRI) scans, then perform iterative refinement with a stochastic differential equation (SDE) solver while enforcing data consistency from LR scans. Our method provides faster sampling compared to existing generative approaches and achieves competitive results on key metrics, though it does not surpass fully supervised baselines in PSNR and SSIM. Notably, while supervised models often report higher numerical metrics, we observe that they can produce suboptimal reconstructions due to their reliance on fixed upscaling kernels. Finally, we introduce the SRMRI dataset, containing LR and HR images obtained from scanner for training and evaluating MR image super-resolution models. Code and dataset are available at: https://github.com/arpanpoudel/SRMRI}
}



@InProceedings{pmlr-v301-qahqaie26a,
  title = 	 {Intelligent Lesion Selection: A Novel Method for Longitudinal Assessment of Breast Cancer Lung Metastases},
  author =       {Qahqaie, Melika and Zimmer, Veronika A and Castaneda, Eduardo and Peltonen, Katariina and Laaksolilj, Joonas and L{\"a}hteenmaa, Juho and Heimann, Tobias and Maier, Andreas and Neumann, Dominik},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1342--1355},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/qahqaie26a/qahqaie26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/qahqaie26a.html},
  abstract = 	 {Breast cancer, the second most common cancer globally, often metastasizes to the lungs, requiring frequent computed tomography (CT) scans to monitor disease progression. Manual analysis by radiologists is time-consuming and prone to variability, underscoring the need for automated systems to enhance accuracy and efficiency. The goal of such systems is to optimize processes like RECIST score calculation for tumor response assessment. This study presents a pipeline for the automated temporal analysis of breast cancer lung metastases. Existing lung nodule detection and segmentation models were adapted for detecting and segmenting breast cancer metastases. Registration-based lesion tracking was incorporated, and a novel Temporal Lesion Pair Classifier was developed to identify significant lesions and estimate tumor load evolution by summing their diameters, following an adaptation of the RECIST guidelines. Evaluated on a unique dataset of breast cancer patients, each with multiple annotated CT scans at different disease stages, the proposed pipeline demonstrated a 42\% reduction in median tumor size progression discrepancy for consecutive study pairs and improved tumor response classification accuracy by 22\% at the patient level.}
}



@inproceedings{pmlr-v301-qi26a,
  author    = {Qi, Qiuyi and Li, Xin and Kong, Ming and Xu, Zikang and Chen, Bingdi and Zhu, Qiang and Zhou, S Kevin},
  title     = {Style-Aligned Image Composition for Robust Detection of Abnormal Cells in Cytopathology},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  series    = {Proceedings of Machine Learning Research},
  volume    = {301},
  pages     = {1356--1371},
  publisher = {PMLR},
  month     = {09--11 Jul},
  year      = {2026},
  url       = {https://proceedings.mlr.press/v301/qi26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/qi26a/qi26a.pdf},
  abstract  = {Challenges such as the lack of high-quality annotations, long-tailed data distributions, and inconsistent staining styles pose significant obstacles to training neural networks to detect abnormal cells in cytopathology robustly. This paper proposes a style-aligned image composition (SAIC) method that composes high-fidelity and style-preserved pathological images to enhance the effectiveness and robustness of detection models. Without additional training, SAIC first selects an appropriate candidate from the abnormal cell bank based on attribute guidance. Then, it employs a high-frequency feature reconstruction to achieve a style-aligned and high-fidelity composition of abnormal cells and pathological backgrounds. Finally, it introduces a large vision-language model to filter high-quality synthesis images. Experimental results demonstrate that incorporating SAIC-synthesized images effectively enhances the performance and robustness of abnormal cell detection for tail categories and styles, thereby improving overall detection performance. The comprehensive quality evaluation further confirms the generalizability and practicality of SAIC in clinical application scenarios. Our code will be released at https://github.com/Joey-Qi/SAIC.},
}



@InProceedings{pmlr-v301-qin26a,
  title = 	 {Real-time Breast Lesion Detection in Videos via Spatial-temporal Feature Aggregation},
  author =       {Qin, Chao and Cao, Jiale and Khan, Fahad Shahbaz and Khan, Salman and Fu, Huazhu and Ahissar, Ehud and Anwer, Rao Muhammad},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1372--1383},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/qin26a/qin26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/qin26a.html},
  abstract = 	 {Recently, transformer-based detectors have shown impressive performance for breast lesion detection in ultrasound videos. However, these methods often require substantial computational resource and exhibit low inference speed, which poses challenges towards real-time applicability. To address this issue, we introduce a fast yet accurate spatial-temporal transformer, named FA-DETR, to efficiently aggregate multi-scale spatial-temporal features for breast lesion detection in ultrasound videos. Our FA-DETR is based on a lightweight spatial-temporal self-attention module, which seamlessly fuses spatial and temporal features extracted from each video frame. In the decoding phase, we employ IoU-aware query selection to generate independent queries for each frame. These queries gain access to rich spatial-temporal information through the encoder embeddings' cross-attention and frame-aware cross-attention mechanisms. Experiments conducted on a public breast lesion ultrasound video dataset demonstrate that our FA-DETR achieves state-of-the-art performance with an absolute gain of 3.8\% in terms of overall AP while being 2.5 times faster, compared to the best existing approach in the literature. Our code and models will be publicly released.}
}



@InProceedings{pmlr-v301-rahman26a,
  title = 	 {Training-Free Dataset Pruning for Polyp Segmentation via Community Detection in Similarity Networks},
  author =       {Rahman, Md Mostafijur and Marculescu, Radu},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1384--1402},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/rahman26a/rahman26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/rahman26a.html},
  abstract = 	 {Recent advances in deep learning have been driven by the availability of larger datasets and more complex models; however, this progress comes at the expense of substantial computational and annotation costs. To address these issues, we introduce a new, training-free dataset pruning method, \emph{PRIME}, targeting polyp segmentation in medical imaging. To this end, \emph{PRIME} constructs a similarity network among images in the target dataset and then applies community detection to retain a much smaller, yet representative subset of images from the original dataset. Unlike existing methods that require model training for dataset pruning, our \emph{PRIME} completely avoids model training, thus significantly reducing computational demands. The reduction in the training dataset reduces 56.2\% data annotation costs and enables 2.3$\times$ faster training of polyp segmentation models compared to training on the entire annotated dataset, with only a 0.5\% drop in the DICE score. Consequently, our \emph{PRIME} enables efficient training, fine-tuning, and domain adaptation across medical centers, thus offering a cost-effective solution for deep learning in polyp segmentation. Our implementation is available at https://github.com/SLDGroup/PRIME.}
}



@InProceedings{pmlr-v301-schneider26a,
  title = 	 {Learning from a Few Shots: Data-efficient Cervical Vertebral Maturation Assessment},
  author =       {Schneider, Helen and Parikh, Aditya and Priya, Priya and Bro{\ss}, Maximilian and Verhofstadt, Tom and Konermann, Anna and Sifa, Rafet},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1403--1417},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/schneider26a/schneider26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/schneider26a.html},
  abstract = 	 {The timing of treatment is a crucial decision in orthodontics. Initiating treatment during the appropriate growth phase leads to optimal patient outcomes and can prevent prolonged treatment durations. The most commonly used method for classifying growth phases is cervical vertebral maturation (CVM) assessment, which categorizes CVM into six stages based on the shape and size of the cervical vertebrae. Due to the complexity of manual CVM analysis, it often falls short in performance when assessed visually. Deep learning methods can assist physicians in classifying CVM stages, thus improving orthodontic workflows and treatments. However, a significant challenge in deep learning-based CVM assessment is the limited dataset volume, resulting from difficulties in data collection and annotation. While small training datasets can greatly hinder the model's generalization performance, research on data-efficient training methods for CVM assessment is still lacking. To the best of our knowledge, this paper is the first to evaluate the potential of few-shot learning and in-domain transfer learning for CVM assessment. Specifically, we investigate the architectures ResNet18 and MedSam-2D. Few-shot learning enhances classification performance by up to 9\%. Additionally, in-domain pre-training (using chest X-ray data) results in a significant performance increase of up to 4\%.}
}



@inproceedings{pmlr-v301-shah26a,
  author    = {Shah, Nikhil and Kulkarni, Pranav and Doo, Florence and Li, Ang and Jacobs, Michael A. and Parekh, Vishwa Sanjay},
  title     = {Federated Class-Heterogeneous Report Labeling with Surgical Aggregation},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  series    = {Proceedings of Machine Learning Research},
  volume    = {301},
  pages     = {1418--1429},
  publisher = {PMLR},
  month     = {09--11 Jul},
  year      = {2026},
  url       = {https://proceedings.mlr.press/v301/shah26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/shah26a/shah26a.pdf},
  abstract  = {Labeling radiology reports is essential for creating medical imaging datasets and enabling AI-driven clinical decision support. While SBERT-based classifiers offer computationally efficient solutions for this task, a major challenge is the class heterogeneity across datasets, as different groups focus on extracting distinct disease labels. For instance, NIH and CheXpert CXR datasets share only 7 of their 14 and 13 labels, respectively. To address this, we propose to use Surgical Aggregation, a class-heterogeneous federated learning framework that collaboratively trains a global multi-label classifier without requiring alignment of labeling schemes across clients. Surgical Aggregation selectively merges shared class weights while appending new disease-specific nodes, thereby unifying distinct local labeling priorities, to dynamically incorporate all disease labels of interest. We evaluated Surgical Aggregation in multiple simulated settings with varying number of participating nodes as well as different degrees of overlapping labels. Our results demonstrate high performance confirming adaptability in class-heterogeneous environments, thereby offering a scalable and privacy-preserving solution for collaborative medical report labeling. Our code is available at https://github.com/BioIntelligence-Lab/Federated-MedEmbedX},
}



@InProceedings{pmlr-v301-shah26b,
  title = 	 {A Vision Foundation Model for Cataract Surgery Using Joint-Embedding Predictive Architecture},
  author =       {Shah, Nisarg A and Xia, Mingze and Vijay, Subhasri and Sikder, Shameema and Vedula, S. Swaroop and Patel, Vishal M.},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1430--1444},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/shah26b/shah26b.pdf},
  url = 	 {https://proceedings.mlr.press/v301/shah26b.html},
  abstract = 	 {Vision foundation models can automate analysis of surgical videos and enable multiple applications that support patient care and surgical training. For cataract surgery, existing models are limited by reliance on small datasets, privacy concerns, and poor generalizability across surgical settings. In this paper, we introduce JHU-VPT(JEPA), a self-supervised vision foundation model leveraging Joint-Embedding Predictive Architecture (JEPA) to learn spatiotemporal representations via latent feature prediction on a large corpus of unlabeled cataract videos, without requiring extensive labeled datasets or pixel-level reconstruction. JHU-VPT(JEPA) is pretrained on 2591 videos from multiple sites that capture different surgical technique and style variations. Comprehensive evaluations on step recognition, surgical feedback, and skill assessment tasks demonstrate that JHU-VPT(JEPA) outperforms existing methods. JHU-VPT(JEPA)'s effectiveness is evident even when using attentive probing with a frozen encoder, highlighting the robustness of the learned features and addressing privacy concerns by not requiring access to raw videos during downstream tasks. Our approach offers a scalable, generalizable, and privacy-preserving solution for surgical video analysis, with significant potential to advance patient care and surgical education.}
}



@InProceedings{pmlr-v301-shuaibu26a,
  title = 	 {Capturing Longitudinal Changes in Brain Morphology Using Temporally Parameterized Neural Displacement Fields},
  author =       {Shuaibu, Aisha L. and Gibb, Kieran A. and Wijeratne, Peter A. and Simpson, Ivor J A},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1445--1464},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/shuaibu26a/shuaibu26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/shuaibu26a.html},
  abstract = 	 {Longitudinal image registration enables studying temporal changes in brain morphology which is useful in applications where monitoring the growth or atrophy of specific structures is important. However this task is challenging due to; noise/artifacts in the data and quantifying small anatomical changes between sequential scans. We propose a novel longitudinal registration method that models structural changes using temporally parameterized neural displacement fields. Specifically, we implement an implicit neural representation (INR) using a multi-layer perceptron that serves as a continuous coordinate-based approximation of the deformation field at any time point. In effect, for any $N$ scans of a particular subject, our model takes as input a 3D spatial coordinate location $x, y, z$ and a corresponding temporal representation $t$ and learns to describe the continuous morphology of structures for both observed and unobserved points in time. Furthermore, we leverage the analytic derivatives of the INR to derive a new regularization function that enforces monotonic rate of change in the trajectory of the voxels, which is shown to provide more biologically plausible patterns. We demonstrate the effectiveness of our method on 4D brain MR registration. Our code is publicly available here https://github.com/aisha-lawal/inrmorph}
}



@InProceedings{pmlr-v301-stym-popper26a,
  title = 	 {{DAFTED}: Decoupled Asymmetric Fusion of Tabular and Echocardiographic Data for Cardiac Hypertension Diagnosis},
  author =       {Stym-Popper, J{\'e}r{\'e}mie and Painchaud, Nathan and Courand, Pierre-Yves and Rambour, Cl{\'e}ment and Thome, Nicolas and Bernard, Olivier},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1465--1482},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/stym-popper26a/stym-popper26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/stym-popper26a.html},
  abstract = 	 {Multimodal data fusion has emerged as a key approach in recent years for enhancing diagnosis and prognosis in many medical applications. With the advent of transformer-based methods, it is now possible to combine information from different modalities that provide complementary insights. However, most existing methods rely on symmetric fusion schemes, assuming equal importance for information carried by each modality---a strong assumption that may not always hold true. In this study, we propose an alternative fusion strategy based on an asymmetric scheme. Starting with a primary modality that offers the most critical information, we integrate secondary modality contributions by disentangling shared and modality-specific information. The proposed model was validated on a dataset of 239 patients for characterizing hypertension severity by fusing time series automatically extracted from echocardiographic image sequences and tabular data from patient records. Results show that our approach outperforms existing unimodal and multimodal approaches, achieving an AUC score over 90\% - a crucial benchmark for clinical use.}
}



@InProceedings{pmlr-v301-su26a,
  title = 	 {Mesh-Prompted Anatomy Segmentation},
  author =       {Su, Dingjie and Liu, Yihao and Zuo, Lianrui and Dawant, Benoit},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1483--1494},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/su26a/su26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/su26a.html},
  abstract = 	 {We present a novel technique for segmenting anatomical structures in medical images by using a canonical mesh as a prompt for the structure to be segmented. Unlike point-prompted segmentation methods, such as those based on Segment-Anything Models, mesh prompting reduces the ambiguity associated with point prompts and provides a stronger shape prior, which is particularly advantageous for many medical applications. Our approach performs mesh-prompted segmentation by registering the signed distance function (SDF) of the mesh to the target image using a vector-field attention network trained with boundary-based loss terms. Before registration, the prompted mesh is roughly aligned with the structure in the target image using a center prompt provided by the user. This method allows for independent initialization of each structure's position and the prediction of deformation fields specific to each structure, which offers advantages over segmentation via direct image registration that typically relies on a single deformation field to accommodate all structures. Additionally, it preserves surface correspondence better than image registration using region-based loss terms. We evaluate our method on two CT datasets featuring common ear and body structures. A comparison of our technique with image registration and other state-of-the-art segmentation methods shows that our approach achieves superior segmentation accuracy.}
}



@InProceedings{pmlr-v301-tang26a,
  title = 	 {{DuoFormer}: Leveraging Hierarchical Visual Representations by Local and Global Attention},
  author =       {Tang, Xiaoya and Zhang, Bodong and Ho, Man M. and Knudsen, Beatrice and Tasdizen, Tolga},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1495--1507},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/tang26a/tang26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/tang26a.html},
  abstract = 	 {Despite the widespread adoption of transformers in medical applications, the exploration of multi-scale learning through transformers remains limited, while hierarchical representations are thought to be advantageous for medical diagnosis. We propose a novel hierarchical transformer model that adeptly integrates the feature extraction capabilities of Convolutional Neural Networks (CNNs) with the advanced representational potential of Vision Transformers (ViTs). Addressing the lack of inductive biases and dependence on extensive training datasets in ViTs, our model employs a CNN backbone to generate hierarchical visual representations. These representations are adapted for transformer input through an innovative patch tokenization process, preserving the inherited multi-scale inductive biases. We also introduce a scale-wise attention mechanism that directly captures intra-scale and inter-scale associations. This mechanism complements patch-wise attention by enhancing spatial understanding and preserving global perception, which we refer to as local and global attention, respectively. Our model significantly outperforms baseline models in terms of classification accuracy, demonstrating its efficiency in bridging the gap between Convolutional Neural Networks (CNNs) and Vision Transformers (ViTs). The components are designed as plug-and-play for different CNN architectures and can be adapted for multiple applications. The code is available at https://github.com/xiaoyatang/DuoFormer.git.}
}



@inproceedings{pmlr-v301-teuber26a,
  author    = {Teuber, Carolin and Archit, Anwai and Pape, Constantin},
  title     = {Parameter Efficient Fine-Tuning of Segment Anything Model for Biomedical Imaging},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  series    = {Proceedings of Machine Learning Research},
  volume    = {301},
  pages     = {1508--1549},
  publisher = {PMLR},
  month     = {09--11 Jul},
  year      = {2026},
  url       = {https://proceedings.mlr.press/v301/teuber26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/teuber26a/teuber26a.pdf},
  abstract  = {Segmentation is an important analysis task for biomedical images, enabling the study of individual organelles, cells or organs. Deep learning has massively improved segmentation methods, but challenges remain in generalization to new conditions, requiring costly data annotation. Vision foundation models, such as Segment Anything Model (SAM), address this issue through improved generalization. However, these models still require finetuning on annotated data, although with less annotations, to achieve optimal results for new conditions. As a downside, they require more computational resources. This makes parameter-efficient finetuning (PEFT) relevant. We contribute the first comprehensive study of PEFT for SAM applied to biomedical images. We find that the placement of PEFT layers is more important for efficiency than the type of layer for vision transformers and we provide a recipe for resource-efficient finetuning.},
}



@inproceedings{pmlr-v301-tomar26a,
  author    = {Tomar, Priya and Parikh, Aditya and Feodorovici, Philipp and Arensmeyer, Jan and Matthaei, Hanno and Bauckhage, Christian and Schneider, Helen and Sifa, Rafet},
  title     = {Effective Disjoint Representational Learning for Anatomical Segmentation},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  series    = {Proceedings of Machine Learning Research},
  volume    = {301},
  pages     = {1550--1567},
  publisher = {PMLR},
  month     = {09--11 Jul},
  year      = {2026},
  url       = {https://proceedings.mlr.press/v301/tomar26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/tomar26a/tomar26a.pdf},
  abstract  = {In the wake of the limited availability of pertinent datasets, the application of computer vision methods for semantic segmentation of abdominal structures is mainly constrained to surgical instruments or organ-specific segmentations. Multi-organ segmentation has the potential to furnish supplementary assistance in multifarious domains in healthcare, for instance, robot-assisted laparoscopic surgery. However, in addition to the complexity involved in discriminating anatomical structures due to their visual attributes and operative conditions, the representation bias pertaining to organ size results in poor segmentation performance on organs with smaller pixel proportions. In this work, we focus on alleviating the influence of representation bias by involving different encoder-decoder frameworks for learning organ-specific features. In particular, we investigate the effect of organ-specific decoders on binary segmentation of anatomical structures in abdominal surgery. Additionally, we analyze the effect of organ-specific pretraining on the multi-label segmentation in two model training settings including knowledge sharing and disjoint learning, in relation to the contextual feature sharing between organ-specific decoders. Our results illustrate the significant gain in segmentation performance by incorporating organ-specific decoders, especially for less represented organs.},
}



@InProceedings{pmlr-v301-vaish26a,
  title = 	 {Data-Agnostic Augmentations for Unknown Variations: Out-of-Distribution Generalisation in {MRI} Segmentation},
  author =       {Vaish, Mei and Meister, Felix and Heimann, Tobias and Brune, Christoph and Wolterink, Jelmer M.},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1568--1596},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/vaish26a/vaish26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/vaish26a.html},
  abstract = 	 {Medical image segmentation models are often trained on curated datasets, leading to performance degradation when deployed in real-world clinical settings due to mismatches between training and test distributions. While data augmentation techniques are widely used to address these challenges, traditional visually consistent augmentation strategies lack the robustness needed for diverse real-world scenarios. In this work, we systematically evaluate alternative augmentation strategies, focusing on MixUp and Auxiliary Fourier Augmentation. These methods mitigate the effects of multiple variations without explicitly targeting specific sources of distribution shifts. We demonstrate how these techniques significantly improve out-of-distribution generalization and robustness to imaging variations across a wide range of transformations in cardiac cine MRI and prostate MRI segmentation. We quantitatively find that these augmentation methods enhance learned feature representations by promoting separability and compactness. Additionally, we highlight how their integration into nnU-Net training pipelines provides an easy-to-implement, effective solution for enhancing the reliability of medical segmentation models in real-world applications. Our code is available at: https://github.com/MIAGroupUT/augmentations-for-the-unknown.}
}



@InProceedings{pmlr-v301-varma26a,
  title = 	 {{MedVAE}: Efficient Automated Interpretation of Medical Images with Large-Scale Generalizable Autoencoders},
  author =       {Varma, Maya and Kumar, Ashwin and van der Sluijs, Rogier and Ostmeier, Sophie and Blankemeier, Louis and Chambon, Pierre Joseph Marcel and Bluethgen, Christian and Prince, Jip and Langlotz, Curtis and Chaudhari, Akshay S},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1597--1626},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/varma26a/varma26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/varma26a.html},
  abstract = 	 {Medical images are acquired at high resolutions with large fields of view in order to capture fine-grained features necessary for clinical decision-making. Consequently, training deep learning models on medical images can incur large computational costs. In this work, we address the challenge of downsizing medical images in order to improve downstream computational efficiency while preserving clinically-relevant features. We introduce MedVAE, a family of six large-scale 2D and 3D autoencoders capable of encoding medical images as downsized latent representations and decoding latent representations back to high-resolution images. We train MedVAE autoencoders using a novel two-stage training approach with 1,052,730 medical images. Across diverse tasks obtained from 20 medical image datasets, we demonstrate that (1) utilizing MedVAE latent representations in place of high-resolution images when training downstream models can lead to efficiency benefits (up to 70x improvement in throughput) while simultaneously preserving clinically-relevant features and (2) MedVAE can decode latent representations back to high-resolution images with high fidelity. Our work demonstrates that large-scale, generalizable autoencoders can help address critical efficiency challenges in the medical domain. Code: https://github.com/StanfordMIMI/MedVAE}
}



@InProceedings{pmlr-v301-vasylechko26a,
  title = 	 {Enhancing Wrist Fracture Detection through LLM-Powered Data Extraction and Knowledge-Based Ensemble Learning},
  author =       {Vasylechko, Serge Didenko and Tsai, Andy and Afacan, Onur and Kurugol, Sila},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1627--1637},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/vasylechko26a/vasylechko26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/vasylechko26a.html},
  abstract = 	 {The accuracy and generalization of deep learning models for fracture detection and classification in wrist radiographs is often limited by the scarcity of high-quality annotated data and class imbalances. Traditional annotation methods are time-consuming, expensive and prone to inter-observer variability \cite{rajpurkar2017mura}.  To address these challenges, we developed an automated, cost-free approach to extract structured information from radiology reports, such as fracture type, location and severity. Our technique incorporates methods introduced by MedPrompt \cite{nori2023can}, and leverages domain expertise for group based sampling \cite{khan2024knowledge}. Using these structured language labels alongside a pre-trained YOLO v7 backbone \cite{nagy2022pediatric, ciri2023bonefracture}, which initially demonstrated low accuracy scores on our clinical data, we were able to selectively finetune the model in pseudo-blind manner. This approach utilized the extracted language labels without requiring expert annotations for training. We curated a large dataset of almost 3,000 pediatric wrist X-ray images and their corresponding radiology reports. Validation and testing were conducted on a smaller subset of 300 expert-annotated images. Our findings indicate that this pseudo-blind training strategy significantly enhances the base accuracy of the pre-trained model, achieving performance comparable to models fine-tuned with meticulously labeled expert annotations. Specifically, we improved the mean Average Precision (mAP) detection score for true positives related to fractures from 76% to 83%. Additionally, we observed improvements in precision and recall metrics for fracture detection. By integrating prompt-based information extraction with knowledge-based grouping, we achieved a robust and effective model for fracture detection.}
}



@InProceedings{pmlr-v301-vigneault26a,
  title = 	 {Cardiac Computed Tomography Angiography Plane Prediction and Comprehensive LV Segmentation},
  author =       {Vigneault, Davis Marc and Manohar, Ashish and Hernandez, Abraham and Wong, Krista Tin Chi and Kong, Fanwei and Gegenava, Tea and Nieman, Koen and Fleischmann, Dominik},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1638--1652},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/vigneault26a/vigneault26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/vigneault26a.html},
  abstract = 	 {The use of cardiac computed tomography angiography (CCTA) has dramatically increased over the past decade, with an increasingly recognized role for functional assessment; however, reformatting these datasets into standard cardiac planes and performing quantitative analysis remains time consuming and disruptive to clinical workflows. Here, we propose a fully automated, volumetric, end-to-end trained network for simultaneous detection of standard cardiac planes and comprehensive left ventricular (LV) segmentation in the predicted short axis coordinate system. The architecture consists of a coarse segmentation module, a transformation module, and a fine segmentation module. The coarse segmentation module provides an initial segmentation of the full field of view (FOV) axial images at low resolution. The transformation module predicts the rotations corresponding to the standard cardiac planes (short axis, SAX; two chamber, 2CH; three chamber, 3CH; and four chamber, 4CH) and reformats the source volume into the predicted SAX coordinate system at high resolution. Finally, the fine segmentation module segments the narrow FOV, high resolution SAX volume. The dataset consisted of 313 CCTA studies partitioned into training, validation, and testing in an 80:10:10 split. Architectural decisions are justified using ablation experiments. On the test set, the proposed architecture achieved accurate plane predictions (mean angle errors of $9.1\pm6.2^\circ$, $9.5\pm5.4^\circ$, $9.0\pm5.9^\circ$, and $8.8\pm5.9^\circ$ for the SAX, 2CH, 3CH, and 4CH planes, respectively) and high quality segmentations (Dice scores of $0.955\pm0.008$, $0.928\pm0.016$, and $0.808\pm0.029$ for the bloodpool, myocardium, and trabeculations, respectively). This fully automated pipeline has the potential to replace current manual workflows, expediting the availability of standard cardiac planes and quantitative analysis for clinical interpretation.}
}



@InProceedings{pmlr-v301-vigneshwaran26a,
  title = 	 {Evaluating Shortcut Utilization in Deep Learning Disease Classification through Counterfactual Analysis},
  author =       {Vigneshwaran, Vibujithan and Stanley, Emma A.M. and Souza, Raissa and Ohara, Erik and Wilms, Matthias and Forkert, Nils},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1653--1668},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/vigneshwaran26a/vigneshwaran26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/vigneshwaran26a.html},
  abstract = 	 {Although deep learning models can surpass human performance in many medical image analysis tasks, they remain vulnerable to algorithmic shortcuts, where spurious correlations in the data are exploited, which may lead to reduced trust in their predictions/classifications. This issue is especially concerning when models rely on protected attributes (e.g., sex, race, or site) as shortcuts. Such shortcut reliance not only impairs their ability to generalize to unseen datasets but also raises fairness concerns, ultimately undermining their purpose for computer-aided diagnosis. Previous techniques for analyzing protected attributes, such as supervised prediction layer information tests, only highlight the presence of protected attributes in the feature space but do not confirm their role in solving the primary task. Determining the impact of protected attributes as shortcuts is particularly challenging, as it requires knowing how a model would perform without those attributes — a counterfactual scenario typically unattainable in real-world data. As a workaround, researchers have addressed the absence of counterfactuals by generating synthetic datasets with and without protected attributes. In this study, we propose a novel approach to evaluate real-world datasets and determine the extent to which each protected attribute is used as a shortcut in a classification task. Therefore, we define and train a causal generative model to produce causally-grounded counterfactuals, removing protected attributes from activations and allowing us to measure their impact on model performance. Employing T1-weighted MRI data from 9 sites (835 subjects: 426 with Parkinson’s disease (PD) and 409 healthy), we demonstrate that counterfactually removing the `site' attribute from the penultimate layer of a trained classification model reduced the AUROC for PD classification from 0.74 to 0.65, indicating a 9% performance improvement achieved by using `site' as a shortcut.
In contrast, counterfactually removing the `sex' attribute had minimal impact on performance, with only a slight change of 0.004, indicating that `sex' was not utilized as a shortcut by the classification model. The proposed method offers a robust framework for assessing shortcut utilization in medical image classification, paving the way for improved bias detection and mitigation in medical imaging tasks. The code for this work is available on https://github.com/vibujithan/shortcut-analysis.}
}



@InProceedings{pmlr-v301-vries26a,
  title = 	 {Neural fields for tissue attenuation curve reconstruction in sparsely sampled time-resolved CT},
  author =       {de Vries, Lucas and van Herten, Rudolf Leonardus Mirjam and van der Sluijs, P. Matthijs and Isgum, Ivana and Emmer, Bart J. and Majoie, Charles B.L.M. and Marquering, Henk and Gavves, Efstratios},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1669--1687},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/vries26a/vries26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/vries26a.html},
  abstract = 	 {Time-resolved CT imaging can aid acute ischemic stroke diagnosis by visualizing contrast agent transport through the brain (micro)vasculature. CT perfusion imaging, while widely used for stroke diagnosis, requires approximately 30 sequential scans, leading to extensive radiation exposure and motion sensitivity. As an alternative to CTP perfusion imaging, some hospitals opt for multiphase CT angiography for time-resolved analysis with reduced radiation dose. However, multiphase CT angiography lacks standardized perfusion analysis capabilities, making it more challenging to interpret than CT perfusion imaging. We present Sparse Temporal Attenuation Reconstruction (STAR), a novel approach using conditional neural fields that reconstructs tissue attenuation curves from sparse observations, allowing for reduced radiation exposure and motion sensitivity with CT perfusion, while enabling perfusion analysis from multiphase CT angiography. Our method generates full tissue attenuation curves using only 4 out of 30 observations. The results show that perfusion maps from reconstructed data match the reference perfusion maps, potentially reducing radiation and allowing recovery of motion-corrupted images. Moreover, STAR enables perfusion analysis in centers using multiphase CT angiography. Consequently, STAR has the potential to improve the stroke imaging work-up while making perfusion analysis more widely accessible.}
}



@InProceedings{pmlr-v301-wang26a,
  title = 	 {STNAGNN: Data-driven Spatio-temporal Brain Connectivity beyond FC},
  author =       {Wang, Jiyao and Dvornek, Nicha C and Duan, Peiyu and Staib, Lawrence H. and Ventola, Pamela and Duncan, James S},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1688--1705},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/wang26a/wang26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/wang26a.html},
  abstract = 	 {In recent years, graph neural networks (GNNs) have been widely applied in the analysis of brain fMRI, yet defining the connectivity between ROIs remains a challenge in noisy fMRI data. Among all approaches, Functional Connectome (FC) is the most popular method. Computed by the correlation coefficients between ROI time series, FC is a powerful and computationally efficient way to estimate ROI connectivity. However, it is well known for neglecting structural connections and causality in ROI interactions. Also, FC becomes much more noisy in the short spatio-temporal sliding-window subsequences of fMRI. Effective Connectome (EC) is proposed as a directional alternative, but it is difficult to accurately estimate. Furthermore, for optimal GNN performance, usually only a small percentage of the strongest connections are selected as sparse edges, resulting in oversimplification of complex brain connections. To tackle these challenges, we propose the Spatio-Temporal Node Attention Graph Neural Network (STNAGNN) as a data-driven alternative that combines sparse predefined FC with dense data-driven spatio-temporal connections, allowing for flexible and spatio-temporal learning of ROI interaction patterns.}
}



@InProceedings{pmlr-v301-wang26b,
  title = 	 {Predicting Prostate Cancer Progression During Active Surveillance Using Longitudinal bpMRI Scans and A Multi-scale Foundation Model},
  author =       {Wang, Yifan and Lou, Bin and von Busch, Heinrich and Grimm, Robert and Punnen, Sanoj and Comaniciu, Dorin and Kamen, Ali and Huisman, Henkjan and Tong, Angela and Winkel, David and Penzkofer, Tobias and Shabunin, Ivan and Choi, Moon Hyung and Yang, Qingsong and Szolar, Dieter and Shea, Steven and Coakley, Fergus and Harisinghani, Mukesh},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1706--1722},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/wang26b/wang26b.pdf},
  url = 	 {https://proceedings.mlr.press/v301/wang26b.html},
  abstract = 	 {Active Surveillance (AS) is the recommended management strategy for patients with low- or intermediate-risk Prostate Cancer (PCa), providing a safe alternative that helps avoid the adverse effects of overtreatment. While artificial intelligence (AI)-based models for PCa detection have been extensively studied, their application in AS remains challenging, with limited research addressing the detection of PCa progression in AS scenarios. In this study, we present a novel framework for predicting PCa progression within AS protocols using bi-parametric MRI (bpMRI). Due to the limited availability of longitudinal bpMRI scans (206 patients in our study), we first developed a multi-scale foundation model trained on a large cohort of single-year bpMRI scans, comprising 5,162 patients from 10 different institutions. Building on this foundation model, we designed a three-module framework: (1) a lesion detection module to identify PCa lesions in full bpMRI scans, (2) a lesion classification module to perform detailed analysis of the identified lesion regions, and (3) a multi-scan lesion progression prediction module to assess changes in lesions over time using longitudinal bpMRI patches. The proposed framework was evaluated on a cohort from an AS clinical trial and demonstrated significant performance improvements over baseline models and radiologists, highlighting its potential to enhance clinical decision-making in AS management.}
}



@InProceedings{pmlr-v301-ward26a,
  title = 	 {Improving brain disorder diagnosis with advanced brain function representation and Kolmogorov-Arnold Networks},
  author =       {Ward, Tyler and Imran, Abdullah Al Zubaer},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1723--1739},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/ward26a/ward26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/ward26a.html},
  abstract = 	 {Quantifying functional connectivity (FC), a vital metric for the diagnosis of various brain disorders, traditionally relies on the use of a pre-defined brain atlas. However, using such atlases can lead to issues regarding selection bias and lack of regard for specificity. Addressing this, we propose a novel transformer-based classification network (ABFR-KAN) with effective brain function representation, to aid in diagnosing autism spectrum disorder (ASD). ABFR-KAN leverages Kolmogorov-Arnold Network (KAN) blocks replacing traditional multi-layer perceptron (MLP) components. Thorough experimentation reveals the effectiveness of ABFR-KAN in improving the diagnosis of ASD under various configurations of the model architecture. Our code is available at https://github.com/tbwa233/ABFR-KAN.}
}



@InProceedings{pmlr-v301-yao26a,
  title = 	 {LOTUS: Latent Outpainting Diffusion Model for Three-Dimensional Ultrasound Stitching},
  author =       {Yao, Xing and Yu, Runxuan and DiSanto, Nick and Aghdam, Ehsan Khodapanah and Oguine, Kanyifeechukwu Jane and Lu, Daiwei and Lou, Ange and Wang, Jiacheng and Hu, Dewei and Arenas, Gabriel A and Oguz, Baris and Pouch, Alison Marie and Schwartz, Nadav and Byram, Brett and Oguz, Ipek},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1740--1754},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/yao26a/yao26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/yao26a.html},
  abstract = 	 {3D ultrasound (3DUS) stitching can enlarge the field-of-view (FOV) by registering partially overlapping 3DUS images collected from different probe positions. However, standard registration algorithms frequently encounter difficulties with this task, primarily due to the sector-shaped FOV, which often leads to pronounced local minima, thereby obstructing optimization efforts. To address these limitations, we propose LOTUS, a novel Latent Diffusion Model (LDM) specifically designed for 3DUS FOV outpainting. LOTUS innovatively encodes the 3DUS data into a compact latent space and performs outpainting at test time, effectively extending the sector-shaped FOV into a standard rectangular shape. This transformation facilitates a more robust registration by mitigating the issues of local minima associated with the original FOV shape. Experimental results show that LOTUS significantly improves the accuracy of the registration as well as the efficiency of the outpainting process compared to existing models. The code is available at https://github.com/MedICL-VU/LOTUS.}
}



@InProceedings{pmlr-v301-yassine26a,
  title = 	 {LiFE-Net: Longitudinal information Fusion for Enhanced lesion detection in unsupervised learning contexts},
  author =       {Yassine, Walid and Charachon, Martin and Hudelot, Celine and Ardon, Roberto},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1755--1770},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/yassine26a/yassine26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/yassine26a.html},
  abstract = 	 {Accurate detection of liver lesions in longitudinal follow-up is critical for assessing disease progression. Unlike clinical practices that compare multiple time points, most deep-learning approaches treat these time points independently. Existing longitudinal imaging methods, particularly in brain imaging, use strategies like channel-wise concatenation, recurrent architectures, or temporal difference computation. However, these methods might fall short in liver imaging due to challenges like non-rigid motions, anatomical variability, and changes in imaging conditions. To address these challenges, we introduce LiFE-Net, the first framework to integrate longitudinal information from baseline liver CT scans through feature fusion. Our method employs intermediate feature fusion via self-attention mechanisms, leveraging baseline images to incorporate longitudinal information for more accurate predictions. We adopt an unsupervised training approach using synthetic lesions to address the lack of supervised datasets for longitudinal liver tumors. Our results show improvements in detection performance on follow-up images when baseline information is incorporated, with gains in both detection mAP and ROC AUC per exam metrics. An exhaustive ablation study further highlights the impact of baseline image integration, registration quality, and architectural components in achieving these improvements. Our code for LiFE-Net is made publicly available at: https://github.com/walid-yassine/LiFE-Net}
}



@InProceedings{pmlr-v301-yiasemis26a,
  title = 	 {Joint Supervised and Self-supervised Learning for MRI Reconstruction},
  author =       {Yiasemis, George and Moriakov, Nikita and S{\'a}nchez, Clara I. and Sonke, Jan-Jakob and Teuwen, Jonas},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1771--1794},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/yiasemis26a/yiasemis26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/yiasemis26a.html},
  abstract = 	 {Magnetic Resonance Imaging (MRI) is a crucial modality, but its inherently slow acquisition process poses challenges in obtaining fully-sampled $k$-space data under motion. The lack of fully-sampled acquisitions, serving as ground truths, complicates the training of deep learning (DL) algorithms in a supervised manner.  To address this limitation, self-supervised learning (SSL) methods have emerged as a viable alternative, leveraging available subsampled $k$-space data to train neural networks for MRI reconstruction. Nevertheless, these approaches often fall short when compared to supervised learning (SL). We propose Joint Supervised and Self-supervised Learning (JSSL), a novel training approach for DL-based MRI reconstruction algorithms aimed at enhancing reconstruction quality in cases where target datasets containing fully-sampled $k$-space measurements are unavailable. JSSL operates by simultaneously training a model in a SSL setting, using subsampled data from the target dataset(s), and in a SL manner, utilizing proxy datasets with fully-sampled $k$-space data. We demonstrate JSSL's efficacy using two distinct combinations of target and proxy data. Quantitative and qualitative results showcase substantial improvements over conventional SSL methods. Furthermore, we provide ``rule-of-thumb'' guidelines for training MRI reconstruction models. Our code is available at https://github.com/NKI-AI/direct.}
}



@InProceedings{pmlr-v301-yu26a,
  title = 	 {A Novel GNN Framework Integrating Neuroimaging and Behavioral Information to Understand Adolescent Psychiatric Disorders},
  author =       {Yu, Weifeng and Qu, Gang and Kim, Young-geun and Xu, Lei and Zhang, Aiying},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1795--1810},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/yu26a/yu26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/yu26a.html},
  abstract = 	 {Functional connectivity (FC) is widely used to study various psychiatric disorders, but its consistency is often undermined by significant inter-subject variability. While these differences can be reflected in behavioral characteristics, few studies have combined them with FC. To this end, we propose a novel graph learning framework that enhances the differentiation of psychiatric disorders by integrating FC and behavioral characteristics. Additionally, we apply Grad-CAM to enhance model interpretability by identifying key regions of interest involved in distinguishing individuals with psychiatric disorders from healthy controls. Experiments with the Adolescent Brain Cognitive Development dataset highlighted two critical insights: the thalamus and specific ROIs within the somatomotor and cingulo-opercular networks play a critical role for identifying psychiatric disorders. Additionally, visualization of latent representations demonstrated that individuals with externalizing disorders, specifically Attention Deficit Hyperactivity Disorder and Oppositional Defiant Disorder, can be distinguished from healthy controls. These findings underscore the utility of our graph learning framework for identifying psychiatric disorders and suggest its promise for improving diagnostic accuracy.}
}



@InProceedings{pmlr-v301-zhang26a,
  title = 	 {How to select slices for annotation to train best-performing deep learning segmentation models for cross-sectional medical images?},
  author =       {Zhang, Yixin and Kramer, Kevin and Mazurowski, Maciej A},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1811--1831},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/zhang26a/zhang26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/zhang26a.html},
  abstract = 	 {Automated segmentation of medical images heavily relies on the availability of precise manual annotations. However, generating these annotations is often time-consuming, expensive, and sometimes requires specialized expertise (especially for cross-sectional medical images). Therefore, it is essential to optimize the use of annotation resources to ensure efficiency and effectiveness. In this paper, we systematically address the question: ``in a non-interactive annotation pipeline, how should slices from cross-sectional medical images be selected for annotation to maximize the performance of the resulting deep learning segmentation models?'' We conducted experiments on 4 medical imaging segmentation tasks with varying annotation budgets, numbers of annotated cases, numbers of annotated slices per volume, slice selection techniques, and mask interpolations. We found that: 1) It is almost always preferable to annotate fewer slices per volume and more volumes given an annotation budget. 2) Selecting slices for annotation by unsupervised active learning (UAL) is not superior to selecting slices randomly or at fixed intervals, provided that each volume is allocated the same number of annotated slices. 3) Interpolating masks between annotated slices rarely enhances model performance, with exceptions of some specific configuration for 3D models.}
}



@InProceedings{pmlr-v301-zhang26b,
  title = 	 {SSDD-GAN: Single-Step Denoising Diffusion GAN for Cochlear Implant Surgical Scene Completion},
  author =       {Zhang, Yike and Davalos, Eduardo and Noble, Jack},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1832--1844},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/zhang26b/zhang26b.pdf},
  url = 	 {https://proceedings.mlr.press/v301/zhang26b.html},
  abstract = 	 {Recent deep learning-based image completion methods, including both inpainting and outpainting, have demonstrated promising results in restoring corrupted images by effectively filling various missing regions. Among these, Generative Adversarial Networks (GANs) and Denoising Diffusion Probabilistic Models (DDPMs) have been employed as key generative image completion approaches, excelling in the field of generating high-quality restorations with reduced artifacts and improved fine details. In previous work, we developed a method aimed at synthesizing views from novel microscope positions for mastoidectomy surgeries; however, that approach did not have the ability to restore the surrounding surgical scene environment. In this paper, we propose an efficient method to complete the surgical scene of the synthetic postmastoidectomy dataset. Our approach leverages self-supervised learning on real surgical datasets to train a Single-Step Denoising Diffusion-GAN (SSDD-GAN), combining the advantages of diffusion models with the adversarial optimization of GANs for improved Structural Similarity results of 6%. The trained model is then directly applied to the synthetic postmastoidectomy dataset using a zero-shot approach, enabling the generation of realistic and complete surgical scenes without the need for explicit ground-truth labels from the synthetic postmastoidectomy dataset. This method addresses key limitations in previous work, offering a novel pathway for full surgical microscopy scene completion and enhancing the usability of the synthetic postmastoidectomy dataset in surgical preoperative planning and intraoperative navigation.}
}



@InProceedings{pmlr-v301-zhang26c,
  title = 	 {MedCL: Learn Consistent Anatomy Distribution for Scribble-supervised Medical Image Segmentation},
  author =       {Zhang, Ke and Patel, Vishal M.},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1845--1865},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/zhang26c/zhang26c.pdf},
  url = 	 {https://proceedings.mlr.press/v301/zhang26c.html},
  abstract = 	 {Curating large-scale fully annotated datasets is expensive, laborious, and cumbersome, especially for medical images. Several methods have been proposed in the literature that make use of weak annotations in the form of scribbles. However, these approaches require large amounts of scribble annotations, and are only applied to the segmentation of regular organs, which are often unavailable for the disease species that fall in the long-tailed distribution. Motivated by the fact that the medical labels have anatomy distribution priors, we propose a scribble-supervised clustering-based framework, called MedCL, to learn the inherent anatomy distribution of medical labels. Our approach consists of two steps: i) Shuffle the features with intra- and inter-image mix operations, and ii) Perform feature clustering and regularize the anatomy distribution at both local and global levels. Combined with a small amount of weak supervision, the proposed MedCL is able to segment both regular organs and challenging irregular pathologies. We implement MedCL based on SAM and UNet backbones, and evaluate the performance on three open datasets of regular structure (MSCMRseg), multiple organs (BTCV) and irregular pathology (MyoPS). It is shown that even with less scribble supervision, MedCL substantially outperforms the conventional segmentation methods. Our code is available at https://github.com/BWGZK-keke/MedCL.}
}



@InProceedings{pmlr-v301-zhang26d,
  title = 	 {Anatomy-Guided Surface Diffusion Model for Alzheimer's Disease Normative Modeling},
  author =       {Zhang, Jianwei and Shi, Yonggang},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1866--1878},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/zhang26d/zhang26d.pdf},
  url = 	 {https://proceedings.mlr.press/v301/zhang26d.html},
  abstract = 	 {Normative modeling has emerged as a pivotal approach for characterizing heterogeneity and individual variance in neurodegenerative diseases, notably Alzheimer's disease (AD). One of the challenges of cortical normative modeling is the anatomical structure mismatch due to folding pattern variability. Traditionally, registration is applied to address this issue and recently deep generative models are employed to generate anatomically aligned samples for analyzing disease progression; however, these models are predominantly applied to volume-based data, which often falls short in capturing intricate morphological changes on the brain cortex. As an alternative, surface-based analysis has been proven to be more sensitive in disease modeling such as AD. Yet, like volume-based data, it also suffers from the mismatch problem. To address these limitations, we propose a novel generative normative modeling framework by transferring the conditional diffusion generative model to the spherical domain. Furthermore, the proposed model generates normal feature map distributions by explicitly conditioning on individual anatomical segmentation to ensure better geometrical alignment which helps to reduce variance between subjects in normative analysis. We find that our model can generate samples that are better anatomically aligned than registered reference data and through ablation study and normative assessment experiments, the samples are able to better measure individual differences from the normal distribution and increase sensitivity in differentiating cognitively normal (CN), mild cognitive impairment (MCI), and Alzheimer's disease (AD) patients.}
}



@inproceedings{pmlr-v301-zhao26a,
  author    = {Zhao, An and Xu, Moucheng and Shahin, Ahmed H. and Wuyts, Wim and Jones, Mark G. and Jacob, Joseph and Alexander, Daniel C.},
  title     = {4D-VQ-GAN: A World Model for Synthesizing Medical Scans at Any Time Point for Personalized Disease Progression Modeling of Idiopathic Pulmonary Fibrosis},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  series    = {Proceedings of Machine Learning Research},
  volume    = {301},
  pages     = {1879--1909},
  year      = {2026},
  month     = {09--11 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/zhao26a/zhao26a.pdf},
  url       = {https://proceedings.mlr.press/v301/zhao26a.html},
  abstract  = {Understanding the progression trajectories of diseases is crucial for early diagnosis and effective treatment planning. This is especially vital for life-threatening conditions such as Idiopathic Pulmonary Fibrosis (IPF), a chronic, progressive lung disease with a prognosis comparable to many cancers. Computed tomography (CT) imaging has been established as a reliable diagnostic tool for IPF. Accurately predicting future CT scans of early-stage IPF patients can aid in developing better treatment strategies, thereby improving survival outcomes. As inspired by the recent success of world models in generating video-based virtual physical worlds, we present the first world model for IPF, to synthesize realistic scans of early-stage IPF patients at any time point. We term our model 4D Vector Quantised Generative Adversarial Networks (4D-VQ-GAN). Our model is trained using a two-stage approach. In the first stage, a 3D-VQ-GAN is trained to reconstruct CT volumes. In the second stage, a Neural Ordinary Differential Equation (ODE) model is trained to capture the temporal dynamics of the quantised embeddings, which are generated by the encoder trained in the first stage. For clinical validation, we conduct survival analysis using imaging biomarkers derived from generated CT scans and achieve a C-index either better than or comparable to that of biomarkers derived from the real CT scans. The survival analysis results suggest the potential clinical utility inherent to generated longitudinal CT scans, showing that they can reliably predict survival outcomes. The code is publicly available at https://github.com/anzhao920/4DVQGAN.}
}



@InProceedings{pmlr-v301-zhou26a,
  title = 	 {SurgicalSemiSeg: A Semi-Supervised Framework for Laparoscopic Image Segmentation},
  author =       {Zhou, Yuning and Badgery, Henry and Read, Matthew and Bailey, James and Davey, Catherine E.},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1910--1929},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/zhou26a/zhou26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/zhou26a.html},
  abstract = 	 {Deep learning applications in surgery are heavily reliant on large-scale datasets with high-quality annotations, which are costly and time-consuming to obtain. Self-supervised learning (SSL) has shown significant potential for reducing reliance on labelled data. This work investigates the use of SSL for semantic segmentation in laparoscopic cholecystectomy (LC) surgery. Through evaluation of existing SSL methods, we find that pixel-level objectives enable the most effective representation learning for laparoscopic imaging, characterised by highly variable and deformable anatomy. Building on this insight, we develop a tailored masked denoising autoencoder with a carefully optimised masking ratio and patch size for semantic segmentation. Our method achieves state-of-the-art performance across three LC datasets. Of note, it significantly improves segmentation accuracy for critical anatomical structures that are under-represented in training datasets. Furthermore, our approach achieves generalisability, with pre-trained representations performing effectively across fine-tuning datasets from different LC datasets.}
}



@InProceedings{pmlr-v301-zhu26a,
  title = 	 {Point-Based Shape Representation Generation with a Correspondence-Preserving Diffusion Model},
  author =       {Zhu, Shen and Jin, Yinzhu and Zawar, Ifrah and Fletcher, Tom},
  booktitle = 	 {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  pages = 	 {1930--1942},
  year = 	 {2026},
  editor = 	 {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  volume = 	 {301},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--11 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v301/main/assets/zhu26a/zhu26a.pdf},
  url = 	 {https://proceedings.mlr.press/v301/zhu26a.html},
  abstract = 	 {We propose a diffusion model designed to generate point-based shape representations with correspondences. Traditional statistical shape models have considered point correspondences extensively, but current deep learning methods do not take them into account, focusing on unordered point clouds instead. Current deep generative models for point clouds do not address generating shapes with point correspondences between generated shapes. This work aims to formulate a diffusion model that is capable of generating realistic point-based shape representations, which preserve point correspondences that are present in the training data. Using shape representation data with correspondences derived from Open Access Series of Imaging Studies 3 (OASIS-3), we demonstrate that our correspondence-preserving model effectively generates point-based hippocampal shape representations that are highly realistic compared to existing methods. We further demonstrate the applications of our generative model by downstream tasks, such as conditional generation of healthy and AD subjects and predicting morphological changes of disease progression by counterfactual generation.}
}



@inproceedings{pmlr-v301-zhu26b,
  author    = {Zhu, Junchao and Deng, Ruining and Yao, Tianyuan and Xiong, Juming and Qu, Chongyu and Guo, Junlin and Lu, Siqi and Tang, Yucheng and Xu, Daguang and Yin, Mengmeng and Wang, Yu and Zhao, Shilin and Wang, Yaohong and Yang, Haichun and Huo, Yuankai},
  title     = {MagNet: Multi-Level Attention Graph Network for Predicting High-Resolution Spatial Transcriptomics},
  booktitle = {Proceedings of The 8th International Conference on Medical Imaging with Deep Learning},
  editor    = {Tasdizen, Tolga and Elhabian, Shireen and Summers, Ronald and Chen, Chen and Koch, Lisa and Zhuang, Yan},
  series    = {Proceedings of Machine Learning Research},
  volume    = {301},
  pages     = {1943--1955},
  year      = {2026},
  month     = {09--11 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v301/main/assets/zhu26b/zhu26b.pdf},
  url       = {https://proceedings.mlr.press/v301/zhu26b.html},
  abstract  = {The rapid development of spatial transcriptomics (ST) offers new opportunities to explore the gene expression patterns within the spatial microenvironment. Current research integrates pathological images to infer gene expression, addressing the high costs and time-consuming processes to generate spatial transcriptomics data. However, as spatial transcriptomics resolution continues to improve, existing methods remain primarily focused on gene expression prediction at low-resolution (55$\mu$m) spot levels. These methods face significant challenges, especially the information bottleneck, when they are applied to high-resolution (8$\mu$m) HD data. To bridge this gap, this paper introduces MagNet, a multi-level attention graph network designed for accurate prediction of high-resolution HD data. MagNet employs cross-attention layers to integrate features from multi-resolution image patches hierarchically and utilizes a GAT-Transformer module to aggregate neighborhood information. By integrating multilevel features, MagNet overcomes the limitations posed by low-resolution inputs in predicting high-resolution gene expression. We systematically evaluated MagNet and existing ST prediction models on both a private spatial transcriptomics dataset and a public dataset at three different resolution levels. The results demonstrate that MagNet achieves state-of-the-art performance at both spot level and high-resolution bin levels, providing a novel methodology and benchmark for future research and applications in high-resolution HD-level spatial transcriptomics. Code is available at https://github.com/Junchao-Zhu/MagNet.}
}



