[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"branding":3,"analytics":7,"article-longer-math-problems-trip-up-ai-models-more-often":10,"sections":34},{"siteName":4,"siteTagline":5,"publisherName":4,"contactEmail":6},"The Revision","Tech news, decoded.","editor@therevision.news",{"gaMeasurementId":8,"adsenseClientId":9},"G-ZW2MV82GYR","ca-pub-8533917693782264",{"article":11},{"id":12,"slug":13,"title":14,"dek":15,"body_md":16,"tags_json":17,"published_at":18,"created_at":19,"updated_at":20,"status":21,"review_note":22,"review_notes":23,"image_url":22,"persona_id":22,"persona_name":22,"section":24,"tags":25,"sources":29,"feedback":33,"feedback_at":22,"cost_usd":33,"total_tokens":33},1784,"longer-math-problems-trip-up-ai-models-more-often","Longer math problems trip up AI models more often","A new adversarial benchmark finds that both longer prompts and longer solutions reliably predict when large language models will fail at math.","Longer problems, more failures — that is the headline finding from a new study on how math benchmark structure shapes AI performance.\n\nResearchers built an adversarial dataset of expert-authored mathematics problems and measured how two structural variables — prompt length and solution length — relate to model accuracy. Both correlated positively with failure: the longer the question or the longer its expected answer, the more likely models were to get it wrong. A secondary analysis looked at cross-model disagreement and found that, after adjusting for difficulty, both variables showed weak negative associations with how cleanly models separated into better and worse performers, with prompt length edging out solution length as the stronger signal.\n\nThis matters because the AI field leans heavily on math benchmarks to rank models and justify capability claims. If structural length is quietly inflating failure rates independent of actual reasoning difficulty, leaderboard comparisons may be measuring something closer to endurance than intelligence. Labs that tout benchmark scores without disclosing how problem length distributions compare across test sets are giving an incomplete picture.\n\nThe finding echoes earlier work showing that surface-level formatting choices — how a question is phrased, how many steps a solution requires — can shift model accuracy by more than the underlying math does. Benchmark design, in other words, is doing more editorial work than most announcements admit.","[\"ai\",\"benchmarks\",\"large language models\",\"math reasoning\"]","2026-06-19T04:00:00.000Z","2026-06-19T11:48:27.290Z","2026-06-19T14:22:19.225Z","published",null,[],"ai",[24,26,27,28],"benchmarks","large language models","math reasoning",[30],{"name":31,"url":32},"arXiv cs.AI","https:\u002F\u002Farxiv.org\u002Fabs\u002F2604.07593",0,{"sections":35},[36,40,44,49,54,59,64,68,72,77,82,87,92,97],{"name":37,"slug":24,"count":38,"latest_published_at":39},"AI",491,"2026-06-19T14:59:11.000Z",{"name":41,"slug":42,"count":43,"latest_published_at":18},"Security","security",132,{"name":45,"slug":46,"count":47,"latest_published_at":48},"Policy","policy",88,"2026-06-16T09:26:09.000Z",{"name":50,"slug":51,"count":52,"latest_published_at":53},"Consumer Tech","consumer-tech",78,"2026-06-16T17:58:24.000Z",{"name":55,"slug":56,"count":57,"latest_published_at":58},"Hardware","hardware",62,"2026-06-18T15:24:16.000Z",{"name":60,"slug":61,"count":62,"latest_published_at":63},"Deals","deals",58,"2026-06-19T14:43:50.000Z",{"name":65,"slug":66,"count":62,"latest_published_at":67},"Software","software","2026-06-16T20:00:00.000Z",{"name":69,"slug":70,"count":71,"latest_published_at":18},"Dev Tools","dev-tools",50,{"name":73,"slug":74,"count":75,"latest_published_at":76},"Science","science",38,"2026-06-18T04:00:00.000Z",{"name":78,"slug":79,"count":80,"latest_published_at":81},"Gaming","gaming",31,"2026-06-16T15:25:13.000Z",{"name":83,"slug":84,"count":85,"latest_published_at":86},"General","general",26,"2026-06-13T18:35:15.000Z",{"name":88,"slug":89,"count":90,"latest_published_at":91},"Startups","startups",23,"2026-06-16T15:00:00.000Z",{"name":93,"slug":94,"count":95,"latest_published_at":96},"Reviews","reviews",19,"2026-06-14T08:00:00.000Z",{"name":98,"slug":99,"count":100,"latest_published_at":101},"How-To","how-to",6,"2026-06-16T09:00:00.000Z"]