[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"branding":3,"analytics":7,"article-a-new-benchmark-tests-llms-as-physician-assistants":10,"sections":34},{"siteName":4,"siteTagline":5,"publisherName":4,"contactEmail":6},"The Revision","Tech news, decoded.","editor@therevision.news",{"gaMeasurementId":8,"adsenseClientId":9},"G-ZW2MV82GYR","ca-pub-8533917693782264",{"article":11},{"id":12,"slug":13,"title":14,"dek":15,"body_md":16,"tags_json":17,"published_at":18,"created_at":19,"updated_at":20,"status":21,"review_note":22,"review_notes":23,"image_url":22,"persona_id":22,"persona_name":22,"section":24,"tags":25,"sources":29,"feedback":33,"feedback_at":22,"cost_usd":33,"total_tokens":33},1650,"a-new-benchmark-tests-llms-as-physician-assistants","A New Benchmark Tests LLMs as Physician Assistants","PhysAssistBench puts leading AI models through realistic clinical scenarios and finds them still too unreliable for real medical use.","A new benchmark reveals just how far AI models are from reliably helping doctors in practice.\n\nResearchers introduced PhysAssistBench, a benchmark built from real MIMIC-IV hospital records that simulates the full complexity of a clinical encounter. Instead of testing isolated skills — medical trivia, chart-reading, or bedside manner — it puts models through multi-turn scenarios where they must handle a physician's underspecified requests, a patient's vague symptom descriptions, and precise interaction with electronic health record systems, all at once. The evaluation set covers 1,296 manually reviewed, physician-validated turns and is available in two languages. Experiments with leading LLMs show that current models fail to hold it all together reliably.\n\nThat finding matters because it exposes a structural gap in how clinical AI gets evaluated today. Most benchmarks treat medical knowledge, communication, and tool use as separate tests; a model can ace each in isolation and still fall apart when it has to coordinate them in a single live interaction. The bottleneck, the researchers argue, is not any one capability but the coordination layer between them.\n\nThe benchmark is a useful corrective to the marketing narrative that surrounds medical AI. Vendors routinely tout high scores on clinical knowledge exams as evidence of readiness, but passing a multiple-choice board question is a different task than managing an ambiguous patient conversation while pulling the right data from a hospital record system. PhysAssistBench makes that distinction measurable rather than rhetorical.\n\nNo model cleared the bar here — which is the point. The gap between \"impressive on benchmarks\" and \"safe to put next to a patient chart\" turns out to be wide, and now there is at least one tool designed to quantify it.","[\"ai\",\"healthcare\",\"benchmarks\",\"llms\"]","2026-06-18T04:00:00.000Z","2026-06-19T09:07:15.098Z","2026-06-19T14:21:36.045Z","published",null,[],"ai",[24,26,27,28],"healthcare","benchmarks","llms",[30],{"name":31,"url":32},"arXiv cs.AI","https:\u002F\u002Farxiv.org\u002Fabs\u002F2606.18613",0,{"sections":35},[36,40,44,49,54,59,64,68,72,76,81,86,91,96],{"name":37,"slug":24,"count":38,"latest_published_at":39},"AI",490,"2026-06-19T04:00:00.000Z",{"name":41,"slug":42,"count":43,"latest_published_at":39},"Security","security",132,{"name":45,"slug":46,"count":47,"latest_published_at":48},"Policy","policy",88,"2026-06-16T09:26:09.000Z",{"name":50,"slug":51,"count":52,"latest_published_at":53},"Consumer Tech","consumer-tech",78,"2026-06-16T17:58:24.000Z",{"name":55,"slug":56,"count":57,"latest_published_at":58},"Hardware","hardware",62,"2026-06-18T15:24:16.000Z",{"name":60,"slug":61,"count":62,"latest_published_at":63},"Deals","deals",58,"2026-06-19T14:43:50.000Z",{"name":65,"slug":66,"count":62,"latest_published_at":67},"Software","software","2026-06-16T20:00:00.000Z",{"name":69,"slug":70,"count":71,"latest_published_at":39},"Dev Tools","dev-tools",50,{"name":73,"slug":74,"count":75,"latest_published_at":18},"Science","science",38,{"name":77,"slug":78,"count":79,"latest_published_at":80},"Gaming","gaming",31,"2026-06-16T15:25:13.000Z",{"name":82,"slug":83,"count":84,"latest_published_at":85},"General","general",26,"2026-06-13T18:35:15.000Z",{"name":87,"slug":88,"count":89,"latest_published_at":90},"Startups","startups",23,"2026-06-16T15:00:00.000Z",{"name":92,"slug":93,"count":94,"latest_published_at":95},"Reviews","reviews",19,"2026-06-14T08:00:00.000Z",{"name":97,"slug":98,"count":99,"latest_published_at":100},"How-To","how-to",6,"2026-06-16T09:00:00.000Z"]