feat: MLE Elo rating for multiple checkpoints
Also extend tournament.pair_matches to handle more than 2 groups
Contextualist committed Jan 31, 2024
1 parent 55eca1e commit 2e211e3
Showing 7 changed files with 214 additions and 22 deletions.
39 changes: 33 additions & 6 deletions README.md
@@ -2,12 +2,18 @@

# Lone Arena

When comparing two LLM checkpoints, human evaluation could be tedious.
You need to evaluate a few fine-tuned LLM checkpoints.
None of the existing benchmark suites fits your domain task,
and your content can't be reviewed by a 3rd party (e.g. GPT-4).
Human evaluation seems to be the most viable option...
Well, maybe it's not that bad!

Let's strip down the evaluation process to just a single question:

![lone_arena-ui-en](media/lone_arena-ui-en.png)

Press <kbd>f</kbd> or <kbd>j</kbd> to choose the winner of each match.
You can make the decision, one match at a time.

Inspired by [Chatbot Arena](https://chat.lmsys.org).

@@ -21,14 +27,35 @@ Inspired by [Chatbot Arena](https://chat.lmsys.org).

## Approach

Two models/checkpoints are compared by anonymous evaluation of their responses to the same prompt. For each prompt:
In each match, two of the models/checkpoints are compared by anonymous evaluation of their responses to the same prompt.
Matches are shuffled.

### Top 3, 1v1

1. For each model, generate 8 sample responses. Run a single-elimination tournament to get top 3 responses. (8 matches x 2 models)
`mode = "top3_1v1"` (default if there are 2 models)

A simple additive scoring system.
For each prompt:

1. For each model, generate m=8 sample responses. Run a single-elimination tournament to get the top 3 responses. (m matches x 2 models)
2. Let the best responses of the two models compete, then the 2nd best, then the 3rd best. The winner of each match gets 4.8, 3.2, and 2.0 points, respectively. (3 matches)

Matches are shuffled.
Number of samples and points are configurable.
In the future, I might implement [Elo](https://en.wikipedia.org/wiki/Elo_rating_system) for comparing multiple models.
Number of samples, points, and prompt weights are configurable.
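
To make the arithmetic concrete, here is a minimal sketch of the per-prompt scoring in this mode. It is not code from this repository; the model names and match outcomes are made up, and the point values are the defaults listed above.

```python
# Hypothetical per-prompt scoring for "top3_1v1" with the default point values.
# Assume each model's 8 samples were already reduced to a ranked top 3.
top3_scores = (4.8, 3.2, 2.0)  # points for the 1st / 2nd / 3rd placement match
points = {"model_a": 0.0, "model_b": 0.0}

# Invented outcomes of the three cross-model matches (best vs best, 2nd vs 2nd, 3rd vs 3rd).
winners = ["model_a", "model_b", "model_a"]

for score, winner in zip(top3_scores, winners):
    points[winner] += score

print(points)  # {'model_a': 6.8, 'model_b': 3.2}
```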

### MLE Elo

`mode = "mle_elo"` (default if there are 3+ models)

A maximum likelihood estimate (MLE) of the Elo ratings is used to rank the models.
The Elo implementation is based on [Chatbot Arena's analysis notebook](https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH#scrollTo=PbTdhkLQp113).
For each prompt:

1. For each model, generate m=16 sample responses. Eliminate half of them by pairwise comparison. (m/2 matches x n models, n ≤ m/2+1)
2. Randomly arrange matches between the surviving responses of different models, with each response participating in exactly one match. (mn/4 matches)

The Elo ratings are fitted after all matches are completed.
Number of samples and prompt weights are configurable.
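
The rating fit itself is compact: each match becomes one row of a design matrix, with +log(10) for one model and -log(10) for the other, and a logistic regression without an intercept recovers the Elo scores. Here is a minimal, self-contained sketch in the spirit of the linked notebook; the checkpoint names and match outcomes are invented for illustration.

```python
# Sketch of MLE Elo fitting; the match list below is made up.
import numpy as np
from sklearn.linear_model import LogisticRegression

models = ["ckpt-a", "ckpt-b", "ckpt-c"]
matches = [
    ("ckpt-a", "ckpt-b", 1),  # (left model, right model, 1 if the left model won else 0)
    ("ckpt-b", "ckpt-c", 1),
    ("ckpt-a", "ckpt-c", 0),
    ("ckpt-a", "ckpt-b", 1),
]

SCALE, BASE, INIT_RATING = 400, 10, 1000
idx = {m: i for i, m in enumerate(models)}
x = np.zeros((len(matches), len(models)))
y = np.zeros(len(matches))
for row, (left, right, left_won) in enumerate(matches):
    x[row, idx[left]] = +np.log(BASE)
    x[row, idx[right]] = -np.log(BASE)
    y[row] = left_won

lr = LogisticRegression(fit_intercept=False)
lr.fit(x, y)
elo = SCALE * lr.coef_[0] + INIT_RATING
print(dict(zip(models, np.round(elo))))
```

In the actual implementation (`lone_arena/chatcup.py` in this commit), prompt weights enter the fit as `sample_weight`.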


## Develop

4 changes: 2 additions & 2 deletions evaluate.py
@@ -1,4 +1,4 @@
from lone_arena.chatcup import Top3_1v1
from lone_arena.chatcup import cup_factory
from lone_arena.config import load_config, Config
from lone_arena.files import DocumentDir, ResultDir
from lone_arena.format import format_conversation
@@ -41,7 +41,7 @@ def compete(a, b):
print(msg)
req_queue.put((msg, ""))

cup = Top3_1v1(pnames_todo, mnames, conf.sample, conf.top3_scores)
cup = cup_factory(pnames_todo, mnames, conf)
todo_match, total_match = cup.nmatch(), cup.nmatch(len(pnames))
prog_notify.put((total_match - todo_match, total_match))
itournament = cup.run(compete)
105 changes: 99 additions & 6 deletions lone_arena/chatcup.py
@@ -1,9 +1,19 @@
from .tournament import single_elimination, pair_matches, run_tournament, Player, Podium
from .tournament import (
single_elimination,
pair_matches,
eliminate_half,
run_tournament,
Player,
Podium,
)
from .config import Config

from attrs import define
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

from typing import Callable, Iterable, Protocol, TYPE_CHECKING
from typing import Callable, Iterable, Protocol, cast


class Cup(Protocol):
@@ -21,6 +31,20 @@ def tabulate_result(
...


def cup_factory(
prompt_names: list[str],
model_names: list[str],
conf: Config,
) -> Cup:
match conf.mode.lower():
case "top3_1v1":
return Top3_1v1(prompt_names, model_names, conf.sample, conf.top3_scores)
case "mle_elo":
return MLE_Elo(prompt_names, model_names, conf.sample)
case _:
raise ValueError(f"unknown mode: {conf.mode}")


@define
class Top3_1v1:
prompt_names: list[str]
@@ -45,8 +69,7 @@ def run(
)
run_tournament(ta, tb, compete=compete)

top3 = list(zip(ta.podium.players, tb.podium.players))
tp = pair_matches(top3)
tp = pair_matches([ta.podium.players, tb.podium.players])
run_tournament(tp, compete=compete)

yield tp.podium
@@ -83,5 +106,75 @@ def tabulate_result(
return pd.DataFrame(tb)


if TYPE_CHECKING:
_: type[Cup] = Top3_1v1
@define
class MLE_Elo:
prompt_names: list[str]
model_names: list[str]
nplayer: int

def nmatch(self, nprompt: int | None = None) -> int:
nprompt = nprompt or len(self.prompt_names)
nmodel = len(self.model_names)
return nprompt * (self.nplayer // 4 * nmodel) * 3

def run(
self, compete: Callable[[Player, Player], tuple[Player, Player]]
) -> Iterable[Podium]:
assert self.nplayer % 4 == 0, "expect nplayer divisible by 4"
for pname in self.prompt_names:
te = [
eliminate_half([(pname, mname, i) for i in range(self.nplayer)])
for mname in self.model_names
]
run_tournament(*te, compete=compete)

tp = pair_matches([t.podium.players for t in te], return_loser=True)
run_tournament(tp, compete=compete)

yield tp.podium

def tabulate_result(
self, podiums: list[Podium], score_weights: list[float]
) -> pd.DataFrame:
SCALE, BASE, INIT_RATING = 400, 10, 1000
nmatch_per_prompt = len(podiums[0].players) // 2
nentry = len(podiums) * nmatch_per_prompt
nmodel = len(self.model_names)
x = np.zeros((nentry, nmodel))
y = np.zeros(nentry)

tb = []
mname2idx = {m: i for i, m in enumerate(self.model_names)}
i = 0
for podium in podiums:
stat = np.zeros((nmodel, 2), dtype=int)
for idw, idl in zip(
podium.players[:nmatch_per_prompt], podium.players[nmatch_per_prompt:]
):
j1, j2 = mname2idx[cast(tuple, idw)[1]], mname2idx[cast(tuple, idl)[1]]
stat[j1, 0] += 1
stat[j2, 1] += 1
if i % 2 == 0: # let j1 be the loser
j1, j2 = j2, j1
x[i, j1] = +np.log(BASE)
x[i, j2] = -np.log(BASE)
y[i] = i % 2
i += 1
ptag: list = podium.players
tb.append(
{
"Prompt": ptag[0][0],
**{m: f"+{wc}-{lc}" for m, (wc, lc) in zip(self.model_names, stat)},
}
)

lr = LogisticRegression(fit_intercept=False)
lr.fit(x, y, sample_weight=np.repeat(score_weights, nmatch_per_prompt))
elo_scores = np.round(SCALE * lr.coef_[0] + INIT_RATING)
tb.append(
{
"Prompt": "Elo rating",
**{m: s for m, s in zip(self.model_names, elo_scores)},
}
)
return pd.DataFrame(tb)
3 changes: 3 additions & 0 deletions lone_arena/config.py
@@ -36,6 +36,7 @@ class Prompt:
@define
class Config:
data_dir: Path = Path("./data")
mode: str = "UNSET"
sample: int = 8
top3_scores: tuple[float, float, float] = (4.8, 3.2, 2.0)
model: list[Model] = Factory(list)
@@ -44,6 +45,8 @@ class Config:
def __attrs_post_init__(self):
assert log2(self.sample).is_integer(), "config: sample must be power of 2"
assert len(self.prompt) > 0, "config: expect at least 1 prompt"
if self.mode == "UNSET":
self.mode = "top3_1v1" if len(self.model) == 2 else "mle_elo"


def load_config(fname: str) -> Config:
32 changes: 28 additions & 4 deletions lone_arena/test_tournament.py
@@ -1,6 +1,10 @@
import pytest
from .tournament import *

import pytest

from itertools import combinations
from typing import cast


@pytest.mark.parametrize("case", ["4", "16"])
def test_single_elimination(case):
@@ -20,13 +24,33 @@ def test_single_elimination(case):
assert len(t.podium.players) == 3


def test_pair_matches():
players = [(0, 1), (2, 3), (4, 5), (6, 7)]
t = pair_matches(players)
def test_eliminate_half():
t = eliminate_half(list(range(8)))
assert len(t.init_matches) == 4
assert len(t.podium.players) == 4


def test_pair_matches_2x3():
t = pair_matches([[0, 1, 2], [3, 4, 5]])
assert [m.players for m in t.init_matches] == [[0, 3], [1, 4], [2, 5]]
assert len(t.podium.players) == 3


@pytest.mark.parametrize("case", ["2x3", "4x8", "5x8", "6x8", "6x12"])
def test_pair_matches(case):
mgroup, nplayer = map(int, case.split("x"))
players: list[list[Player]] = [
[(i, j) for j in range(nplayer)] for i in range(mgroup)
]
t = pair_matches(players)
assert len(t.init_matches) == mgroup * nplayer // 2
all_mtypes = set(combinations(range(mgroup), 2))
for m in t.init_matches:
i, j = cast(tuple[tuple, tuple], m.players)
all_mtypes -= {(i[0], j[0])}
assert not all_mtypes


def test_run_tournament():
def compete(p1, p2):
if p1 < p2:
51 changes: 47 additions & 4 deletions lone_arena/tournament.py
@@ -5,6 +5,7 @@
import random
from pathlib import Path
import json
from itertools import combinations
from typing import Self, Callable, Protocol
from collections.abc import Hashable

@@ -101,9 +102,51 @@ def single_elimination(players: list[Player]) -> Tournament:
return Tournament(leaves, top3)


def pair_matches(players: list[tuple[Player, Player]]) -> Tournament:
assert all(len(p) == 2 for p in players), "expect n pairs"
def eliminate_half(players: list[Player]) -> Tournament:
n = len(players)
winners = Podium.for_(n)
matches = [Match((winners, i), None, list(p)) for i, p in enumerate(players)]
assert n % 2 == 0, "expect even number of players"

winners = Podium.for_(n // 2)
matches = [
Match((winners, i), None, list(p)) for i, p in enumerate(batched(players, 2))
]
return Tournament(matches, winners)


def pair_matches(
players: list[list[Player]], *, return_loser: bool = False
) -> Tournament:
players = [list(p[::-1]) for p in players] # reverse & clone
mgroup = len(players)
nplayer = len(players[0])
n_match = mgroup * nplayer // 2

p = Podium.for_(n_match * 2 if return_loser else n_match)
matches = []
# regular pairs
idx = 0
n_match_type = mgroup * (mgroup - 1) // 2
for i, j in combinations(range(mgroup), 2):
for _ in range(n_match // n_match_type):
w_to = (p, idx)
l_to = (p, idx + n_match) if return_loser else None
matches.append(Match(w_to, l_to, [players[i].pop(), players[j].pop()]))
idx += 1
# remaining pairs
i, j = 0, 1
while idx < n_match:
w_to = (p, idx)
l_to = (p, idx + n_match) if return_loser else None
while not players[i]:
i = (i + 1) % mgroup
pi = players[i].pop()
while not players[j]:
j = (j + 1) % mgroup
pj = players[j].pop()
if i == j:
raise RuntimeError("a remaining pair is from the same group")
matches.append(Match(w_to, l_to, [pi, pj]))
idx += 1
i, j = (i + 1) % mgroup, (j + 1) % mgroup

return Tournament(matches, p)
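
For a sense of how the extended multi-group pairing behaves, here is a hypothetical usage sketch (integer-tuple players, as in the unit tests; assumes the `lone_arena` package is importable):

```python
# Hypothetical: three groups of four players, as (group, index) tuples.
from lone_arena.tournament import pair_matches

groups = [[(g, i) for i in range(4)] for g in range(3)]
t = pair_matches(groups, return_loser=True)
for m in t.init_matches:
    print(m.players)
# 3 groups x 4 players gives 6 matches; each of the 3 group pairings
# (0,1), (0,2), (1,2) gets 6 // 3 = 2 matches, so the "remaining pairs"
# loop is never entered in this case.
```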
2 changes: 2 additions & 0 deletions requirements.txt
@@ -3,4 +3,6 @@ openai
attrs
cattrs
pandas
numpy
scikit-learn
tqdm
