🏆 LLM4ASE Leaderboard

Community-driven evaluation of LLMs on real agentic coding tasks, powered by opencode

SWE-Model-Arena pits LLMs head-to-head in blind agentic coding comparisons. Each model drives opencode to read files, write code, run commands, and produce real git diffs — identical scaffold, different brain. Community votes determine the rankings. For technical details, check out our paper.

{
  "headers": [
    "Rank",
    "Model",
    "Organization",
    "Elo Score",
    "Win Rate",
    "Conversation Efficiency Index",
    "Conversation Consistency Index",
    "Bradley-Terry Coefficient",
    "Eigenvector Centrality Value",
    "Newman Modularity Score",
    "PageRank Score"
  ],
  "data": [
    [1, "MiMo-V2-Flash", "Xiaomi", 1002, 1, 1, null, 0, 0, 0, 0.06],
    [1, "Qwen-Plus", "Qwen", 1002, 1, 1, null, 0, 0, 0, 0.06],
    [1, "Grok Code Fast 1", "xAI", 1002, 1, 1, null, 0, 0, 0, 0.06],
    [1, "Llama 3.1 70B Instruct", "Meta", 1002, 1, 1, null, 0, 0, 0, 0.06],
    [1, "o3 Mini High", "OpenAI", 1002, 1, 1, null, 0, 0, 0, 0.06],
    [1, "DeepSeek V3.1 Terminus", "DeepSeek", 1002, 1, 1, null, 0, 0, 0, 0.06],
    [1, "Qwen-Turbo", "Qwen", 1002, 1, 1, null, 0, 0, 0, 0.06],
    [8, "Qwen VL Max", "Qwen", 1000.01, 1, 0.53, null, 0, 0, 0, 0.06],
    [9, "Qwen3 8B", "Qwen", 998, 0, -1, null, 0, 0, 0, 0.03],
    [9, "Gemini 2.5 Pro", "Google", 998, 0, -1, null, 0, 0, 0, 0.03],
    [9, "Gemini 2.5 Flash Lite", "Google", 998, 0, -1, null, 0, 0, 0, 0.03],
    [9, "Nemotron Nano 9B V2", "NVIDIA", 998, 0, 0.3, null, 0, 0, 0, 0.03],
    [9, "GLM 4.5", "Z.AI", 998, 0, 0.3, null, 0, 0, 0, 0.03],
    [9, "Ministral 8B", "Mistral", 998, 0, -1, null, 0, 0, 0, 0.03],
    [9, "GPT-5.1-Codex-Max", "OpenAI", 998, 0, -1, null, 0, 0, 0, 0.03],
    [9, "Sonar Deep Research", "Perplexity", 998, 0, 0.3, null, 0, 0, 0, 0.03],
    [9, "GPT-5 Pro", "OpenAI", 998, 0, -1, null, 0, 0, 0, 0.03],
    [9, "Mixtral 8x22B Instruct", "Mistral", 998, 0, -1, null, 0, 0, 0, 0.03],
    [9, "Claude Haiku 4.5", "Anthropic", 998, 0, 0.3, null, 0, 0, 0, 0.03],
    [9, "GPT-4.1", "OpenAI", 998, 0, 0.3, null, 0, 0, 0, 0.03],
    [21, "GPT-5.1-Codex", "OpenAI", 997.99, 0, -1, null, 0, 0, 0, 0.03],
    [22, "GLM 4.7", "Z.AI", 996, 0, 0.3, null, 0, 0, 0, 0.03],
    [22, "Ministral 3 14B 2512", "Mistral", 996, 0, 0.3, null, 0, 0, 0, 0.03]
  ],
  "metadata": null
}

Made with ❤️ for SWE-Model-Arena. If this work is useful to you, please consider citing our vision paper:

@inproceedings{zhao2025se,
title={SE Arena: An Interactive Platform for Evaluating Foundation Models in Software Engineering},
author={Zhao, Zhimin},
booktitle={2025 IEEE/ACM Second International Conference on AI Foundation Models and Software Engineering (Forge)},
pages={78--81},
year={2025},
organization={IEEE}
}