[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"branding":3,"analytics":7,"article-inside-the-black-box-how-llms-actually-grade-essays":10,"sections":34},{"siteName":4,"siteTagline":5,"publisherName":4,"contactEmail":6},"The Revision","Tech news, decoded.","editor@therevision.news",{"gaMeasurementId":8,"adsenseClientId":9},"G-ZW2MV82GYR","ca-pub-8533917693782264",{"article":11},{"id":12,"slug":13,"title":14,"dek":15,"body_md":16,"tags_json":17,"published_at":18,"created_at":19,"updated_at":20,"status":21,"review_note":22,"review_notes":23,"image_url":22,"persona_id":22,"persona_name":22,"section":24,"tags":25,"sources":29,"feedback":33,"feedback_at":22,"cost_usd":33,"total_tokens":33},1755,"inside-the-black-box-how-llms-actually-grade-essays","Inside the Black Box: How LLMs Actually Grade Essays","New research maps exactly where and how large language models store essay quality judgments, offering rare interpretability into automated scoring systems.","Researchers have cracked open the grading logic inside large language models — and found it is more legible than expected.\n\nA new study tested eight LLMs against three essay datasets — two English (ASAP++ and CSEE) and one Portuguese (ENEM) — to trace how models internally represent essay quality. Using a mix of linear probing, dimensionality reduction, and neuron-level analysis, the researchers found that quality signals are encoded in a linearly accessible form inside the models' hidden layers. Those signals build progressively across layers rather than appearing all at once, and they hold up across different prompting strategies. They even transfer partially across essay prompts with different scoring rubrics, which is not a given when the criteria change.\n\nThe interpretability finding matters because automated essay scoring is already deployed in high-stakes contexts — standardized tests, large university courses — yet almost no one could explain what the model was actually measuring. This work identifies specific \"essay scoring neurons\" whose activations correlate with scores and respond predictably to targeted interventions, giving auditors something concrete to examine. The finding that nonlinear probes add only marginal improvement over linear ones suggests the models are not hiding quality judgments in some complex, entangled representation — the information is sitting near the surface.\n\nOne detail worth noting: longer essays push the model to rely more on deeper layers, which implies the architecture is doing something structurally different at scale rather than simply applying the same rubric to more text. That is a useful constraint for anyone building or auditing these systems — but it also raises the question of whether the neurons the researchers found are measuring genuine writing quality or a proxy that correlates with it in training data.","[\"ai\",\"machine learning\",\"education\",\"interpretability\"]","2026-06-19T04:00:00.000Z","2026-06-19T11:18:09.963Z","2026-06-19T14:22:18.522Z","published",null,[],"ai",[24,26,27,28],"machine learning","education","interpretability",[30],{"name":31,"url":32},"arXiv cs.AI","https:\u002F\u002Farxiv.org\u002Fabs\u002F2606.20152",0,{"sections":35},[36,40,44,49,54,59,64,68,72,77,82,87,92,97],{"name":37,"slug":24,"count":38,"latest_published_at":39},"AI",491,"2026-06-19T14:59:11.000Z",{"name":41,"slug":42,"count":43,"latest_published_at":18},"Security","security",132,{"name":45,"slug":46,"count":47,"latest_published_at":48},"Policy","policy",88,"2026-06-16T09:26:09.000Z",{"name":50,"slug":51,"count":52,"latest_published_at":53},"Consumer Tech","consumer-tech",78,"2026-06-16T17:58:24.000Z",{"name":55,"slug":56,"count":57,"latest_published_at":58},"Hardware","hardware",62,"2026-06-18T15:24:16.000Z",{"name":60,"slug":61,"count":62,"latest_published_at":63},"Deals","deals",58,"2026-06-19T14:43:50.000Z",{"name":65,"slug":66,"count":62,"latest_published_at":67},"Software","software","2026-06-16T20:00:00.000Z",{"name":69,"slug":70,"count":71,"latest_published_at":18},"Dev Tools","dev-tools",50,{"name":73,"slug":74,"count":75,"latest_published_at":76},"Science","science",38,"2026-06-18T04:00:00.000Z",{"name":78,"slug":79,"count":80,"latest_published_at":81},"Gaming","gaming",31,"2026-06-16T15:25:13.000Z",{"name":83,"slug":84,"count":85,"latest_published_at":86},"General","general",26,"2026-06-13T18:35:15.000Z",{"name":88,"slug":89,"count":90,"latest_published_at":91},"Startups","startups",23,"2026-06-16T15:00:00.000Z",{"name":93,"slug":94,"count":95,"latest_published_at":96},"Reviews","reviews",19,"2026-06-14T08:00:00.000Z",{"name":98,"slug":99,"count":100,"latest_published_at":101},"How-To","how-to",6,"2026-06-16T09:00:00.000Z"]