🏆 LLM4ASE Leaderboard

Community-driven evaluation of LLMs on real agentic coding tasks, powered by opencode

SWE-Model-Arena pits LLMs head-to-head in blind agentic coding comparisons. Each model drives opencode to read files, write code, run commands, and produce real git diffs — identical scaffold, different brain. Community votes determine the rankings. For technical details, check out our paper.

{
  "headers": [
    "Rank",
    "Model",
    "Organization",
    "Elo Score",
    "Win Rate",
    "Conversation Efficiency Index",
    "Conversation Consistency Index",
    "Bradley-Terry Coefficient",
    "Eigenvector Centrality Value",
    "Newman Modularity Score",
    "PageRank Score"
  ],
  "data": [
    [1, "MiMo-V2-Flash", "Xiaomi", 1002, 1, 1, null, 0, 0, 0, 0.06],
    [1, "Qwen-Plus", "Qwen", 1002, 1, 1, null, 0, 0, 0, 0.06],
    [1, "Grok Code Fast 1", "xAI", 1002, 1, 1, null, 0, 0, 0, 0.06],
    [1, "Llama 3.1 70B Instruct", "Meta", 1002, 1, 1, null, 0, 0, 0, 0.06],
    [1, "o3 Mini High", "OpenAI", 1002, 1, 1, null, 0, 0, 0, 0.06],
    [1, "DeepSeek V3.1 Terminus", "DeepSeek", 1002, 1, 1, null, 0, 0, 0, 0.06],
    [1, "Qwen-Turbo", "Qwen", 1002, 1, 1, null, 0, 0, 0, 0.06],
    [8, "Qwen VL Max", "Qwen", 1000.01, 1, 0.53, null, 0, 0, 0, 0.06],
    [9, "Qwen3 8B", "Qwen", 998, 0, -1, null, 0, 0, 0, 0.03],
    [9, "Gemini 2.5 Pro", "Google", 998, 0, -1, null, 0, 0, 0, 0.03],
    [9, "Gemini 2.5 Flash Lite", "Google", 998, 0, -1, null, 0, 0, 0, 0.03],
    [9, "Nemotron Nano 9B V2", "NVIDIA", 998, 0, 0.3, null, 0, 0, 0, 0.03],
    [9, "GLM 4.5", "Z.AI", 998, 0, 0.3, null, 0, 0, 0, 0.03],
    [9, "Ministral 8B", "Mistral", 998, 0, -1, null, 0, 0, 0, 0.03],
    [9, "GPT-5.1-Codex-Max", "OpenAI", 998, 0, -1, null, 0, 0, 0, 0.03],
    [9, "Sonar Deep Research", "Perplexity", 998, 0, 0.3, null, 0, 0, 0, 0.03],
    [9, "GPT-5 Pro", "OpenAI", 998, 0, -1, null, 0, 0, 0, 0.03],
    [9, "Mixtral 8x22B Instruct", "Mistral", 998, 0, -1, null, 0, 0, 0, 0.03],
    [9, "Claude Haiku 4.5", "Anthropic", 998, 0, 0.3, null, 0, 0, 0, 0.03],
    [9, "GPT-4.1", "OpenAI", 998, 0, 0.3, null, 0, 0, 0, 0.03],
    [21, "GPT-5.1-Codex", "OpenAI", 997.99, 0, -1, null, 0, 0, 0, 0.03],
    [22, "GLM 4.7", "Z.AI", 996, 0, 0.3, null, 0, 0, 0, 0.03],
    [22, "Ministral 3 14B 2512", "Mistral", 996, 0, 0.3, null, 0, 0, 0, 0.03]
  ],
  "metadata": null
}

Made with ❤️ for SWE-Model-Arena. If this work is useful to you, please consider citing our vision paper:

@inproceedings{zhao2025se,
title={SE Arena: An Interactive Platform for Evaluating Foundation Models in Software Engineering},
author={Zhao, Zhimin},
booktitle={2025 IEEE/ACM Second International Conference on AI Foundation Models and Software Engineering (Forge)},
pages={78--81},
year={2025},
organization={IEEE}
}