2026

Konstantinos I. Roumeliotis; Ranjan Sapkota; Manoj Karkee; Nikolaos D. Tselikas
Agentic AI With Orchestrator-Agent Trust: A Modular Visual Classification Framework With Trust-Aware Orchestration and RAG-Based Reasoning Journal Article
In: IEEE Access, vol. 14, pp. 26965–26982, 2026, ISSN: 2169-3536.
@article{roumeliotis_agentic_2026,
title = {Agentic AI With Orchestrator-Agent Trust: A Modular Visual Classification Framework With Trust-Aware Orchestration and RAG-Based Reasoning},
author = {Konstantinos I. Roumeliotis and Ranjan Sapkota and Manoj Karkee and Nikolaos D. Tselikas},
url = {https://ieeexplore.ieee.org/document/11373381/},
doi = {10.1109/ACCESS.2026.3662282},
issn = {2169-3536},
year = {2026},
date = {2026-01-01},
urldate = {2026-01-01},
journal = {IEEE Access},
volume = {14},
pages = {26965--26982},
abstract = {Modern Artificial Intelligence (AI) increasingly relies on multi-agent architectures that blend visual and language understanding. Yet a pressing challenge remains: how can we trust these agents, especially in zero-shot settings with no fine-tuning? We introduce a novel modular Agentic AI visual classification framework that integrates generalist multimodal agents with a non-visual reasoning orchestrator and a Retrieval-Augmented Generation (RAG) module. Applied to apple leaf disease diagnosis, we benchmark three configurations: (I) zero-shot with confidence-based orchestration, (II) fine-tuned agents with improved performance, and (III) trust-calibrated orchestration enhanced by CLIP-based image retrieval and re-evaluation loops. Using confidence calibration metrics (ECE, OCR, CCC), the orchestrator modulates trust across agents. Our results demonstrate a 77.94% accuracy improvement in the zero-shot setting using trust-aware orchestration and RAG, achieving 85.63% overall. GPT-4o showed better calibration, while Qwen-2.5-VL displayed overconfidence. Furthermore, image-RAG grounded predictions in visually similar cases, enabling correction of agent overconfidence via iterative re-evaluation. The proposed system separates perception (vision agents) from meta-reasoning (orchestrator), enabling scalable and interpretable multi-agent AI. This blueprint illustrates how Agentic AI can deliver trustworthy, modular, and transparent reasoning, and is extensible to diagnostics, biology, and other trust-critical domains. In doing so, we highlight Agentic AI not just as an architecture but as a paradigm for building reliable multi-agent intelligence. All models, prompts, results, and system components, including the complete software source code, are openly released to support reproducibility, transparency, and community benchmarking on our GitHub page.},
keywords = {Accuracy, Adaptation models, Agentic AI, Artificial intelligence, Calibration, Cognition, Costs, orchestrator agent trust, Retrieval augmented generation, retrieval augmented reasoning, Training, trust orchestration, visual classification, Visualization},
pubstate = {published},
tppubtype = {article}
}
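
To make the calibration machinery concrete, the sketch below computes the Expected Calibration Error (ECE) that the abstract lists among its trust metrics. It is a minimal, generic implementation assuming an equal-width 10-bin scheme; the paper's exact binning and its OCR and CCC metrics are not reproduced here.

import numpy as np

def expected_calibration_error(confidences, correct, n_bins=10):
    # ECE: weighted average absolute gap between mean predicted confidence
    # and empirical accuracy, taken over equal-width confidence bins.
    confidences = np.asarray(confidences, dtype=float)
    correct = np.asarray(correct, dtype=float)
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for i in range(n_bins):
        lo, hi = edges[i], edges[i + 1]
        # first bin is closed on the left so a confidence of exactly 0.0 is counted
        in_bin = (confidences >= lo if i == 0 else confidences > lo) & (confidences <= hi)
        if in_bin.any():
            gap = abs(correct[in_bin].mean() - confidences[in_bin].mean())
            ece += in_bin.mean() * gap  # weight by the fraction of samples in the bin
    return ece

# Toy usage: an overconfident agent reports high confidence with mixed correctness.
conf = [0.95, 0.92, 0.88, 0.60, 0.55]  # predicted confidences
hit = [1, 0, 1, 1, 0]                  # 1 = prediction was correct
print(f"ECE = {expected_calibration_error(conf, hit):.3f}")

A perfectly calibrated agent (confidence equals accuracy in every bin) scores an ECE of 0; the larger the value, the more an orchestrator should discount that agent's self-reported confidence.
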
2025

Ranjan Sapkota; Marco Flores-Calero; Rizwan Qureshi; Chetan Badgujar; Upesh Nepal; Alwin Poulose; Peter Zeno; Uday Bhanu Prakash Vaddevolu; Sheheryar Khan; Maged Shoman; Hong Yan; Manoj Karkee
YOLO advances to its genesis: a decadal and comprehensive review of the You Only Look Once (YOLO) series Journal Article
In: Artificial Intelligence Review, vol. 58, no. 9, pp. 274, 2025, ISSN: 1573-7462.
@article{sapkota_yolo_2025,
title = {YOLO advances to its genesis: a decadal and comprehensive review of the You Only Look Once (YOLO) series},
author = {Ranjan Sapkota and Marco Flores-Calero and Rizwan Qureshi and Chetan Badgujar and Upesh Nepal and Alwin Poulose and Peter Zeno and Uday Bhanu Prakash Vaddevolu and Sheheryar Khan and Maged Shoman and Hong Yan and Manoj Karkee},
url = {https://doi.org/10.1007/s10462-025-11253-3},
doi = {10.1007/s10462-025-11253-3},
issn = {1573-7462},
year = {2025},
date = {2025-06-01},
urldate = {2025-06-01},
journal = {Artificial Intelligence Review},
volume = {58},
number = {9},
pages = {274},
abstract = {This review systematically examines the progression of the You Only Look Once (YOLO) object detection algorithms from YOLOv1 to the recently unveiled YOLOv12. Employing a reverse chronological analysis, it traces the advancements introduced by YOLO algorithms, beginning with YOLOv12 and progressing through YOLO11 (or YOLOv11), YOLOv10, YOLOv9, YOLOv8, and subsequent versions to explore each version’s contributions to enhancing speed, detection accuracy, and computational efficiency in real-time object detection. Additionally, this study reviews the alternative versions derived from YOLO architectural advancements: YOLO-NAS, YOLO-X, YOLO-R, DAMO-YOLO, and Gold-YOLO. Moreover, the study highlights the transformative impact of YOLO models across five critical application areas: autonomous vehicles and traffic safety, healthcare and medical imaging, industrial manufacturing, surveillance and security, and agriculture. By detailing the incremental technological advancements in subsequent YOLO versions, this review chronicles the evolution of YOLO and discusses the challenges and limitations of each of the earlier versions. The evolution signifies a path towards integrating YOLO with multimodal, context-aware, and Artificial General Intelligence (AGI) systems for the next YOLO decade, promising significant implications for future developments in AI-driven applications.},
keywords = {Agriculture, Artificial intelligence, Autonomous vehicles, CNN, Computer vision, Deep learning, Healthcare and medical imaging, Industrial manufacturing, Real-time object detection, Surveillance, Traffic safety, YOLO, YOLO configurations, YOLOv1 to YOLOv12, You Only Look Once},
pubstate = {published},
tppubtype = {article}
}
2024

Ranjan Sapkota; Dawood Ahmed; Manoj Karkee
Comparing YOLOv8 and Mask R-CNN for instance segmentation in complex orchard environments Journal Article
In: Artificial Intelligence in Agriculture, vol. 13, pp. 84–99, 2024, ISSN: 2589-7217.
@article{sapkota_comparing_2024,
title = {Comparing YOLOv8 and Mask R-CNN for instance segmentation in complex orchard environments},
author = {Ranjan Sapkota and Dawood Ahmed and Manoj Karkee},
url = {https://www.sciencedirect.com/science/article/pii/S258972172400028X},
doi = {10.1016/j.aiia.2024.07.001},
issn = {2589-7217},
year = {2024},
date = {2024-09-01},
urldate = {2024-09-01},
journal = {Artificial Intelligence in Agriculture},
volume = {13},
pages = {84--99},
abstract = {Instance segmentation, an important image processing operation for automation in agriculture, is used to precisely delineate individual objects of interest within images, which provides foundational information for various automated or robotic tasks such as selective harvesting and precision pruning. This study compares the one-stage YOLOv8 and the two-stage Mask R-CNN machine learning models for instance segmentation under varying orchard conditions across two datasets. Dataset 1, collected in the dormant season, includes images of dormant apple trees, which were used to train multi-object segmentation models delineating tree branches and trunks. Dataset 2, collected in the early growing season, includes images of apple tree canopies with green foliage and immature (green) apples (also called fruitlets), which were used to train single-object segmentation models delineating only immature green apples. The results showed that YOLOv8 performed better than Mask R-CNN, achieving good precision and near-perfect recall across both datasets at a confidence threshold of 0.5. Specifically, for Dataset 1, YOLOv8 achieved a precision of 0.90 and a recall of 0.95 for all classes. In comparison, Mask R-CNN demonstrated a precision of 0.81 and a recall of 0.81 for the same dataset. With Dataset 2, YOLOv8 achieved a precision of 0.93 and a recall of 0.97. Mask R-CNN, in this single-class scenario, achieved a precision of 0.85 and a recall of 0.88. Additionally, the inference times for YOLOv8 were 10.9 ms for multi-class segmentation (Dataset 1) and 7.8 ms for single-class segmentation (Dataset 2), compared to 15.6 ms and 12.8 ms, respectively, for Mask R-CNN. These findings show YOLOv8's superior accuracy and efficiency compared to two-stage models, specifically Mask R-CNN, suggesting its suitability for developing smart and automated orchard operations, particularly when real-time performance is necessary, as in robotic harvesting and robotic immature green fruit thinning.},
keywords = {Artificial intelligence, Automation, Deep learning, Machine Learning, Machine vision, Mask R-CNN, Robotics, YOLOv8},
pubstate = {published},
tppubtype = {article}
}
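
The entry above reports YOLOv8 precision and recall at a confidence threshold of 0.5. As a point of reference, here is how such an inference pass looks with the Ultralytics YOLOv8 API; the checkpoint name yolov8n-seg.pt and the image path orchard.jpg are placeholders, not the study's orchard-trained weights or data.

from ultralytics import YOLO  # pip install ultralytics

model = YOLO("yolov8n-seg.pt")                    # pretrained segmentation checkpoint (placeholder)
results = model.predict("orchard.jpg", conf=0.5)  # 0.5 = confidence threshold used in the study

for r in results:
    if r.masks is None:  # no instances detected above the threshold
        continue
    for box, cls, score in zip(r.boxes.xyxy, r.boxes.cls, r.boxes.conf):
        # one line per detected instance: class id, confidence, bounding box
        print(f"class={int(cls)} conf={float(score):.2f} box={[round(v, 1) for v in box.tolist()]}")

The reported inference times (7.8–10.9 ms for YOLOv8 versus 12.8–15.6 ms for Mask R-CNN) are what make the single-stage design attractive for real-time robotic harvesting and thinning.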

Alex W. Kirkpatrick; Amanda D. Boyd; Jay D. Hmielowski
Who shares about AI? Media exposure, psychological proximity, performance expectancy, and information sharing about artificial intelligence online Journal Article
In: AI & SOCIETY, 2024, ISSN: 1435-5655.
@article{kirkpatrick_who_2024,
title = {Who shares about AI? Media exposure, psychological proximity, performance expectancy, and information sharing about artificial intelligence online},
author = {Alex W. Kirkpatrick and Amanda D. Boyd and Jay D. Hmielowski},
url = {https://doi.org/10.1007/s00146-024-01997-x},
doi = {10.1007/s00146-024-01997-x},
issn = {1435-5655},
year = {2024},
date = {2024-06-01},
urldate = {2024-06-01},
journal = {AI \& SOCIETY},
abstract = {Media exposure can shape audience perceptions surrounding novel innovations, such as artificial intelligence (AI), and could influence whether they share information about AI with others online. This study examines the indirect association between exposure to AI in the media and information sharing about AI online. We surveyed 567 US citizens aged 18 and older in November 2020, several months after the release of OpenAI's transformative GPT-3 model. Results suggest that AI media exposure was related to online information sharing through psychological proximity to the impacts of AI and positive AI performance expectancy in serial mediation. This positive indirect association became stronger the more an individual perceived society to be changing due to new technology. Results imply that public exposure to AI in the media could significantly impact public understanding of AI, and prompt further information sharing online.},
keywords = {Artificial intelligence, Information sharing, Media exposure, Psychological distance, Public engagement with science and technology},
pubstate = {published},
tppubtype = {article}
}
