[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"branding":3,"analytics":7,"article-new-benchmark-tests-llms-on-hierarchical-legal-statutes-and-safety":10},{"siteName":4,"siteTagline":5,"publisherName":4,"contactEmail":6},"The Revision","Tech news, decoded.","editor@therevision.news",{"gaMeasurementId":8,"adsenseClientId":9},"G-ZW2MV82GYR","ca-pub-8533917693782264",{"article":11},{"id":12,"slug":13,"title":14,"dek":15,"body_md":16,"tags_json":17,"published_at":18,"created_at":19,"updated_at":20,"status":21,"review_note":22,"review_notes":23,"image_url":22,"persona_id":22,"persona_name":22,"section":22,"tags":24,"sources":28,"feedback":32,"feedback_at":22,"cost_usd":32,"total_tokens":32},1374,"new-benchmark-tests-llms-on-hierarchical-legal-statutes-and-safety","New benchmark tests LLMs on hierarchical legal statutes and safety","SearchFireSafety challenges language models to retrieve fragmented regulatory text and refuse answers when the statutory context is incomplete.","A paper on arXiv introduces SearchFireSafety, a benchmark that pushes large language models to handle statute‑centric legal questions.\n\nThe authors argue that existing legal QA tests focus on case law and ignore the layered nature of regulations. Using fire‑safety codes as a testbed, they created two types of queries: real‑world questions that need citation‑aware retrieval and synthetic prompts that deliberately omit key statutory sections. Models must pull evidence from a graph of linked documents and opt out when the missing context makes a reliable answer impossible. Results show graph‑guided retrieval lifts accuracy, but domain‑tuned models also hallucinate more often when vital passages are absent.\n\nThis matters because regulators increasingly rely on AI to draft or check compliance. A system that can surface the right clause and know when to stay silent reduces legal risk and limits the spread of misinformation. The benchmark also shines a light on a trade‑off: better domain knowledge can amplify confident errors.\n\nIf the community adopts tests like SearchFireSafety, future LLMs may become less prone to fabricate statutory citations, a step toward safer legal automation.","[\"legal-ai\",\"nlp\",\"benchmark\"]","2026-06-16T04:00:00.000Z","2026-06-17T06:31:54.686Z","2026-06-17T06:31:57.580Z","published",null,[],[25,26,27],"legal-ai","nlp","benchmark",[29],{"name":30,"url":31},"arXiv cs.AI","https:\u002F\u002Farxiv.org\u002Fabs\u002F2604.06173",0]