# Source: train-llm-from-scratch/src/post_training/reward_train.py (FareedKhan-dev/train-llm-from-scratch, branch main)
"""
Reward-model training objective: the Bradley-Terry pairwise loss used by InstructGPT.

Given scalar rewards for a preferred (``chosen``) and dispreferred (``rejected``) response
to the same prompt, the model is trained so the chosen reward exceeds the rejected one:

    L = -log sigmoid(r_chosen - r_rejected)

Preference accuracy (fraction with r_chosen > r_rejected) is the headline eval metric.
"""

from __future__ import annotations

import torch
import torch.nn.functional as F


def bradley_terry_loss(chosen_rewards: torch.Tensor, rejected_rewards: torch.Tensor) -> torch.Tensor:
    """Pairwise Bradley-Terry objective: negated mean of ``log sigmoid(chosen - rejected)``.

    The result shrinks as the chosen reward exceeds the rejected reward by a
    wider gap. The mean is taken over every element of the difference, so the
    return value is a single-element (0-dim) tensor.
    """
    # Per-pair gap; the sign tells which response the model currently prefers.
    pair_gap = chosen_rewards - rejected_rewards
    # log sigmoid(gap) is the log-probability the model gives to the labelled preference.
    log_pref_prob = F.logsigmoid(pair_gap)
    mean_log_pref_prob = log_pref_prob.mean()
    # Negate to turn the average log-likelihood into a loss to minimise.
    return -mean_log_pref_prob


def preference_accuracy(chosen_rewards: torch.Tensor, rejected_rewards: torch.Tensor) -> torch.Tensor:
    """Share of pairs whose chosen reward is strictly greater than the rejected reward.

    Ties (equal rewards) are counted as misses because the comparison is strict.
    The result is a 0-dim float tensor.
    """
    # Boolean mask: True wherever the chosen response outscores the rejected one.
    chosen_wins = chosen_rewards > rejected_rewards
    # Cast to float so that averaging the 0/1 indicators yields a fraction.
    win_indicator = chosen_wins.float()
    return torch.mean(win_indicator)


def reward_margin(chosen_rewards: torch.Tensor, rejected_rewards: torch.Tensor) -> torch.Tensor:
    """Average of ``chosen - rejected`` across the batch, for monitoring during training.

    A positive value means the chosen responses score higher on average; the
    result is a 0-dim tensor.
    """
    # Signed per-pair difference between preferred and dispreferred rewards.
    per_pair_gap = chosen_rewards - rejected_rewards
    return torch.mean(per_pair_gap)