[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"branding":3,"analytics":7,"article-ai-web-agents-cant-reliably-manage-your-privacy-settings":10,"sections":35},{"siteName":4,"siteTagline":5,"publisherName":4,"contactEmail":6},"The Revision","Tech news, decoded.","editor@therevision.news",{"gaMeasurementId":8,"adsenseClientId":9},"G-ZW2MV82GYR","ca-pub-8533917693782264",{"article":11},{"id":12,"slug":13,"title":14,"dek":15,"body_md":16,"tags_json":17,"published_at":18,"created_at":19,"updated_at":20,"status":21,"review_note":22,"review_notes":23,"image_url":22,"persona_id":22,"persona_name":22,"section":24,"tags":25,"sources":30,"feedback":34,"feedback_at":22,"cost_usd":34,"total_tokens":34},1607,"ai-web-agents-cant-reliably-manage-your-privacy-settings","AI web agents can't reliably manage your privacy settings","A new benchmark ran eight agent setups through 200 cookie, session, and account-privacy tasks across 28 sites - and toggles alone sank nearly half.","AI web agents are good at ordering groceries and bad at protecting you while they do it.\n\nA new academic benchmark called WebSP-Eval set out to measure something existing tests ignore: whether an agent can actually complete website security and privacy chores on a user's behalf. The researchers hand-built 200 tasks across 28 websites - adjusting cookie preferences, changing privacy-sensitive account settings, revoking inactive login sessions - and wired up a custom Chrome extension to reset state between runs alongside an automated grader. They ran eight agent configurations built on current multimodal language models. The headline result: the agents could not reliably finish the work, hobbled by weak autonomous exploration.\n\nThis matters because privacy housekeeping is exactly the kind of tedious task people would happily delegate, and it is exactly where the agents fall down. The study fingers stateful UI elements as the main culprit, with simple toggles alone causing more than 45% of task failures across many of the models tested. An agent that cannot tell whether a switch is already on is not one you want guarding your account.\n\nIt is a useful corrective to the broader agent hype. Benchmarks like WebArena ask whether agents can get general tasks done, and SafeArena checks whether they refuse malicious actions. Neither asks the quieter question this one does: can the assistant handle the safety settings you would most want off your plate? On this evidence, not yet.","[\"web agents\",\"ai benchmarks\",\"privacy\",\"llm\"]","2026-06-18T04:00:00.000Z","2026-06-19T05:39:37.602Z","2026-06-19T05:39:40.592Z","published",null,[],"ai",[26,27,28,29],"web agents","ai benchmarks","privacy","llm",[31],{"name":32,"url":33},"arXiv cs.AI","https:\u002F\u002Farxiv.org\u002Fabs\u002F2604.06367",0,{"sections":36},[37,41,45,50,55,60,65,70,74,78,83,88,93,98],{"name":38,"slug":24,"count":39,"latest_published_at":40},"AI",490,"2026-06-19T04:00:00.000Z",{"name":42,"slug":43,"count":44,"latest_published_at":40},"Security","security",132,{"name":46,"slug":47,"count":48,"latest_published_at":49},"Policy","policy",88,"2026-06-16T09:26:09.000Z",{"name":51,"slug":52,"count":53,"latest_published_at":54},"Consumer Tech","consumer-tech",78,"2026-06-16T17:58:24.000Z",{"name":56,"slug":57,"count":58,"latest_published_at":59},"Hardware","hardware",62,"2026-06-18T15:24:16.000Z",{"name":61,"slug":62,"count":63,"latest_published_at":64},"Software","software",58,"2026-06-16T20:00:00.000Z",{"name":66,"slug":67,"count":68,"latest_published_at":69},"Deals","deals",56,"2026-06-19T12:30:04.000Z",{"name":71,"slug":72,"count":73,"latest_published_at":40},"Dev Tools","dev-tools",50,{"name":75,"slug":76,"count":77,"latest_published_at":18},"Science","science",38,{"name":79,"slug":80,"count":81,"latest_published_at":82},"Gaming","gaming",31,"2026-06-16T15:25:13.000Z",{"name":84,"slug":85,"count":86,"latest_published_at":87},"General","general",26,"2026-06-13T18:35:15.000Z",{"name":89,"slug":90,"count":91,"latest_published_at":92},"Startups","startups",23,"2026-06-16T15:00:00.000Z",{"name":94,"slug":95,"count":96,"latest_published_at":97},"Reviews","reviews",19,"2026-06-14T08:00:00.000Z",{"name":99,"slug":100,"count":101,"latest_published_at":102},"How-To","how-to",6,"2026-06-16T09:00:00.000Z"]