2022

Jonathan Dodge; Andrew A. Anderson; Matthew Olson; Rupika Dikkala; Margaret Burnett
How Do People Rank Multiple Mutant Agents? Proceedings Article
In: 27th International Conference on Intelligent User Interfaces, pp. 191–211, Association for Computing Machinery, New York, NY, USA, 2022, ISBN: 978-1-4503-9144-3.
@inproceedings{dodge_how_2022,
title = {How Do People Rank Multiple Mutant Agents?},
author = { Jonathan Dodge and Andrew A. Anderson and Matthew Olson and Rupika Dikkala and Margaret Burnett},
url = {https://doi.org/10.1145/3490099.3511115},
doi = {10.1145/3490099.3511115},
isbn = {978-1-4503-9144-3},
year = {2022},
date = {2022-03-01},
urldate = {2022-03-01},
booktitle = {27th International Conference on Intelligent User Interfaces},
pages = {191--211},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
series = {IUI '22},
abstract = {Faced with several AI-powered sequential decision-making systems, how might someone choose on which to rely? For example, imagine car buyer Blair shopping for a self-driving car, or developer Dillon trying to choose an appropriate ML model to use in their application. Their first choice might be infeasible (i.e., too expensive in money or execution time), so they may need to select their second or third choice. To address this question, this paper presents: 1) Explanation Resolution, a quantifiable direct measurement concept; 2) a new XAI empirical task to measure explanations: “the Ranking Task”; and 3) a new strategy for inducing controllable agent variations---Mutant Agent Generation. In support of those main contributions, it also presents 4) novel explanations for sequential decision-making agents; 5) an adaptation to the AAR/AI assessment process; and 6) a qualitative study around these devices with 10 participants to investigate how they performed the Ranking Task on our mutant agents, using our explanations, and structured by AAR/AI. From an XAI researcher perspective, just as mutation testing can be applied to any code, mutant agent generation can be applied to essentially any neural network for which one wants to evaluate an assessment process or explanation type. As to an XAI user’s perspective, the participants ranked the agents well overall, but showed the importance of high explanation resolution for close differences between agents. The participants also revealed the importance of supporting a wide diversity of explanation diets and agent “test selection” strategies.},
keywords = {AI, Human-Computer Interaction},
pubstate = {published},
tppubtype = {inproceedings}
}
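The mutant-agent idea in the entry above parallels mutation testing: seed a small, known defect into an agent and check whether an assessment process or explanation surfaces it. As a minimal Python sketch only (not the paper's implementation; the toy policy, layer names, and Gaussian noise model are assumptions for illustration), controllable agent variations could be induced by perturbing a copy of a trained policy's weights:

import numpy as np

def make_mutant(weights, layer, scale, rng):
    """Copy `weights` and inject Gaussian noise into one named layer,
    analogous to seeding a single mutation into otherwise-identical code."""
    mutant = {name: w.copy() for name, w in weights.items()}
    mutant[layer] = mutant[layer] + rng.normal(0.0, scale, size=mutant[layer].shape)
    return mutant

# Toy two-layer policy parameters (hypothetical shapes, not from the paper).
rng = np.random.default_rng(0)
base_policy = {
    "hidden": rng.normal(size=(8, 16)),
    "output": rng.normal(size=(16, 4)),
}

# A small family of mutants that differ from the base agent in controlled,
# known ways (which layer is perturbed and by how much).
mutants = [make_mutant(base_policy, layer="output", scale=s, rng=rng)
           for s in (0.1, 0.5, 1.0)]
print(len(mutants), "mutant agents generated")

Because the experimenter controls where and how strongly each mutant differs from the base agent, the mutants provide known ground truth against which participants' rankings and explanations can be compared.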

Roli Khanna; Jonathan Dodge; Andrew Anderson; Rupika Dikkala; Jed Irvine; Zeyad Shureih; Kin-Ho Lam; Caleb R. Matthews; Zhengxian Lin; Minsuk Kahng; Alan Fern; Margaret Burnett
Finding AI’s Faults with AAR/AI: An Empirical Study Journal Article
In: ACM Transactions on Interactive Intelligent Systems, vol. 12, no. 1, pp. 1:1–1:33, 2022, ISSN: 2160-6455.
@article{khanna_finding_2022,
title = {Finding AI’s Faults with AAR/AI: An Empirical Study},
author = { Roli Khanna and Jonathan Dodge and Andrew Anderson and Rupika Dikkala and Jed Irvine and Zeyad Shureih and Kin-Ho Lam and Caleb R. Matthews and Zhengxian Lin and Minsuk Kahng and Alan Fern and Margaret Burnett},
url = {https://doi.org/10.1145/3487065},
doi = {10.1145/3487065},
issn = {2160-6455},
year = {2022},
date = {2022-03-01},
urldate = {2022-03-01},
journal = {ACM Transactions on Interactive Intelligent Systems},
volume = {12},
number = {1},
pages = {1:1--1:33},
abstract = {Would you allow an AI agent to make decisions on your behalf? If the answer is “not always,” the next question becomes “in what circumstances”? Answering this question requires human users to be able to assess an AI agent---and not just with overall pass/fail assessments or statistics. Here users need to be able to localize an agent’s bugs so that they can determine when they are willing to rely on the agent and when they are not. After-Action Review for AI (AAR/AI), a new AI assessment process for integration with Explainable AI systems, aims to support human users in this endeavor, and in this article we empirically investigate AAR/AI’s effectiveness with domain-knowledgeable users. Our results show that AAR/AI participants not only located significantly more bugs than non-AAR/AI participants did (i.e., showed greater recall) but also located them more precisely (i.e., with greater precision). In fact, AAR/AI participants outperformed non-AAR/AI participants on every bug and were, on average, almost six times as likely as non-AAR/AI participants to find any particular bug. Finally, evidence suggests that incorporating labeling into the AAR/AI process may encourage domain-knowledgeable users to abstract above individual instances of bugs; we hypothesize that doing so may have contributed further to AAR/AI participants’ effectiveness.},
keywords = {AI, Human-Computer Interaction},
pubstate = {published},
tppubtype = {article}
}
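The recall/precision framing in the abstract above reduces to simple set arithmetic over which decision steps contain seeded bugs and which steps a participant flagged. A minimal illustration (the step indices below are made up, not study data):

# Steps where real (seeded) bugs occur vs. steps a participant flagged as buggy.
seeded_bugs = {3, 7, 12, 18}
flagged_steps = {3, 7, 9, 12}

true_positives = seeded_bugs & flagged_steps
recall = len(true_positives) / len(seeded_bugs)        # share of real bugs that were located
precision = len(true_positives) / len(flagged_steps)   # share of flags that were real bugs

print(f"recall={recall:.2f}, precision={precision:.2f}")  # recall=0.75, precision=0.75

In these terms, the study's finding is that AAR/AI participants scored higher on both measures: they flagged more of the seeded bugs (recall), and a larger share of their flags pointed at real bugs (precision).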

2021

Jonathan Dodge; Roli Khanna; Jed Irvine; Kin-ho Lam; Theresa Mai; Zhengxian Lin; Nicholas Kiddle; Evan Newman; Andrew Anderson; Sai Raja; Caleb Matthews; Christopher Perdriau; Margaret Burnett; Alan Fern
After-Action Review for AI (AAR/AI) Journal Article
In: ACM Transactions on Interactive Intelligent Systems, vol. 11, no. 3-4, pp. 29:1–29:35, 2021, ISSN: 2160-6455.
@article{dodge_after-action_2021,
title = {After-Action Review for AI (AAR/AI)},
author = { Jonathan Dodge and Roli Khanna and Jed Irvine and Kin-ho Lam and Theresa Mai and Zhengxian Lin and Nicholas Kiddle and Evan Newman and Andrew Anderson and Sai Raja and Caleb Matthews and Christopher Perdriau and Margaret Burnett and Alan Fern},
url = {https://doi.org/10.1145/3453173},
doi = {10.1145/3453173},
issn = {2160-6455},
year = {2021},
date = {2021-01-01},
urldate = {2021-01-01},
journal = {ACM Transactions on Interactive Intelligent Systems},
volume = {11},
number = {3-4},
pages = {29:1--29:35},
abstract = {Explainable AI is growing in importance as AI pervades modern society, but few have studied how explainable AI can directly support people trying to assess an AI agent. Without a rigorous process, people may approach assessment in ad hoc ways---leading to the possibility of wide variations in assessment of the same agent due only to variations in their processes. AAR, or After-Action Review, is a method some military organizations use to assess human agents, and it has been validated in many domains. Drawing upon this strategy, we derived an After-Action Review for AI (AAR/AI), to organize ways people assess reinforcement learning agents in a sequential decision-making environment. We then investigated what AAR/AI brought to human assessors in two qualitative studies. The first investigated AAR/AI to gather formative information, and the second built upon the results, and also varied the type of explanation (model-free vs. model-based) used in the AAR/AI process. Among the results were the following: (1) participants reporting that AAR/AI helped to organize their thoughts and think logically about the agent, (2) AAR/AI encouraged participants to reason about the agent from a wide range of perspectives, and (3) participants were able to leverage AAR/AI with the model-based explanations to falsify the agent’s predictions.},
keywords = {AI, Human-Computer Interaction},
pubstate = {published},
tppubtype = {article}
}
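Result (3) in the abstract above, falsifying the agent's predictions, can be pictured as comparing what a model-based explanation predicted at each decision step against what actually happened. A hedged sketch only (the data shapes and tolerance are assumptions, not the paper's design):

from dataclasses import dataclass

@dataclass
class DecisionStep:
    step: int
    predicted_outcome: float   # what the model-based explanation predicted
    observed_outcome: float    # what was actually observed after acting

def falsified(step: DecisionStep, tolerance: float = 0.5) -> bool:
    """A prediction is falsified when it misses the observation by more than `tolerance`."""
    return abs(step.predicted_outcome - step.observed_outcome) > tolerance

episode = [
    DecisionStep(1, predicted_outcome=1.0, observed_outcome=0.9),
    DecisionStep(2, predicted_outcome=2.0, observed_outcome=0.2),
    DecisionStep(3, predicted_outcome=0.5, observed_outcome=0.6),
]

suspect_steps = [s.step for s in episode if falsified(s)]
print("steps worth reviewing:", suspect_steps)   # -> [2]

Model-free explanations provide no comparable forward prediction to check against, which fits the abstract's observation that falsification happened with the model-based explanations.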