[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"branding":3,"analytics":7,"article-veritas-squeezes-more-from-ai-proof-search-by-reading-error-signals":10,"sections":34},{"siteName":4,"siteTagline":5,"publisherName":4,"contactEmail":6},"The Revision","Tech news, decoded.","editor@therevision.news",{"gaMeasurementId":8,"adsenseClientId":9},"G-ZW2MV82GYR","ca-pub-8533917693782264",{"article":11},{"id":12,"slug":13,"title":14,"dek":15,"body_md":16,"tags_json":17,"published_at":18,"created_at":19,"updated_at":20,"status":21,"review_note":22,"review_notes":23,"image_url":22,"persona_id":22,"persona_name":22,"section":24,"tags":25,"sources":29,"feedback":33,"feedback_at":22,"cost_usd":33,"total_tokens":33},1705,"veritas-squeezes-more-from-ai-proof-search-by-reading-error-signals","VERITAS Squeezes More From AI Proof Search by Reading Error Signals","A new zero-shot framework routes verifier feedback into proof search, lifting formal theorem-proving accuracy above brute-force sampling baselines.","A research framework called VERITAS improves how AI systems search for formal mathematical proofs - by actually reading the errors they generate.\n\nMost large-language-model-based provers treat a verifier's output as binary: the proof either passed or it failed. VERITAS, presented in a new paper, takes a different approach. It runs a two-phase protocol: first, Best-of-N sampling generates candidate proofs; then a critic-guided tree search uses the Phase 1 failures as explicit negative examples to steer further exploration. The result on the standard miniF2F benchmark is 40.6% accuracy, compared to 36.9% for a standalone Best-of-5 baseline and 26.2% for a portfolio approach. On a new 55-theorem combinatorics benchmark the team calls VERITAS-CombiBench, the gap is more dramatic - unguided sampling scored only 1.8% while VERITAS reached 7.3%, suggesting that brute-force generation actively hurts when finding the right lemma names requires iterating on verifier feedback.\n\nFormal theorem proving is one of the cleaner tests of whether a model actually reasons or just pattern-matches. Progress here matters because verified proofs carry mathematical certainty that no amount of confident text generation can fake. The combinatorics result is the more interesting data point: it shows that throwing more samples at a hard problem can make things worse, not just plateau.\n\nThe framework is zero-shot, meaning it requires no task-specific fine-tuning - a practical detail that distinguishes it from prior work that leans on training-time specialization. Whether VERITAS holds up across broader benchmark suites beyond miniF2F and its own combinatorics set is the obvious next question.","[\"ai\",\"formal-methods\",\"theorem-proving\",\"research\"]","2026-06-19T04:00:00.000Z","2026-06-19T10:19:07.694Z","2026-06-19T14:21:37.545Z","published",null,[],"ai",[24,26,27,28],"formal-methods","theorem-proving","research",[30],{"name":31,"url":32},"arXiv cs.AI","https:\u002F\u002Farxiv.org\u002Fabs\u002F2606.19399",0,{"sections":35},[36,40,44,49,54,59,64,68,72,77,82,87,92,97],{"name":37,"slug":24,"count":38,"latest_published_at":39},"AI",491,"2026-06-19T14:59:11.000Z",{"name":41,"slug":42,"count":43,"latest_published_at":18},"Security","security",132,{"name":45,"slug":46,"count":47,"latest_published_at":48},"Policy","policy",88,"2026-06-16T09:26:09.000Z",{"name":50,"slug":51,"count":52,"latest_published_at":53},"Consumer Tech","consumer-tech",78,"2026-06-16T17:58:24.000Z",{"name":55,"slug":56,"count":57,"latest_published_at":58},"Hardware","hardware",62,"2026-06-18T15:24:16.000Z",{"name":60,"slug":61,"count":62,"latest_published_at":63},"Deals","deals",58,"2026-06-19T14:43:50.000Z",{"name":65,"slug":66,"count":62,"latest_published_at":67},"Software","software","2026-06-16T20:00:00.000Z",{"name":69,"slug":70,"count":71,"latest_published_at":18},"Dev Tools","dev-tools",50,{"name":73,"slug":74,"count":75,"latest_published_at":76},"Science","science",38,"2026-06-18T04:00:00.000Z",{"name":78,"slug":79,"count":80,"latest_published_at":81},"Gaming","gaming",31,"2026-06-16T15:25:13.000Z",{"name":83,"slug":84,"count":85,"latest_published_at":86},"General","general",26,"2026-06-13T18:35:15.000Z",{"name":88,"slug":89,"count":90,"latest_published_at":91},"Startups","startups",23,"2026-06-16T15:00:00.000Z",{"name":93,"slug":94,"count":95,"latest_published_at":96},"Reviews","reviews",19,"2026-06-14T08:00:00.000Z",{"name":98,"slug":99,"count":100,"latest_published_at":101},"How-To","how-to",6,"2026-06-16T09:00:00.000Z"]