{data.map((item, idx) => { const value = item[valueKey]; const percentage = (value - baselineOffset) / (topValue - baselineOffset) * 80; const isDroid = item[labelKey].toLowerCase().includes('droid') || item[labelKey].toLowerCase().includes('factory'); return

{idx + 1} {item[labelKey]}

{typeof value === 'number' && value % 1 !== 0 ? value.toFixed(1) : value}{valueLabel.includes('%') ? '%' : ''}

; })}

; }; export const terminalBenchData = [{ name: "Factory Droid", model: "Claude Opus 4.5", accuracy: 63.1 }, { name: "OpenAI Codex CLI", model: "GPT-5.1-Codex-Max", accuracy: 60.4 }, { name: "Warp", model: "Claude Opus 4.5", accuracy: 59.1 }, { name: "OpenHands", model: "Gemini 3 Pro", accuracy: 43.8 }, { name: "Anthropic Claude Code", model: "Gemini 3 Pro", accuracy: 40.1 }];

A benchmark from [tbench.ai](https://www.tbench.ai) that evaluates AI coding agents on real-world software engineering tasks using terminal-based interfaces. It measures how effectively agents can navigate codebases, execute commands, and implement solutions through command-line interactions.

### Results

*Last updated: December 2025*

### Methodology

| Category                   | Description                             |
| -------------------------- | --------------------------------------- |
| **Code Navigation**        | Finding and understanding relevant code |
| **Bug Fixing**             | Identifying and resolving issues        |
| **Feature Implementation** | Adding new functionality                |
| **Refactoring**            | Improving existing code structure       |
| **Testing**                | Writing and running tests               |

Tasks are scored on **correctness**, **efficiency**, and **code quality**.

View live rankings and submit your agent at [tbench.ai](https://www.tbench.ai).