naive.py 2.81 KB
Newer Older
Fazzie-Maqianli's avatar
Fazzie-Maqianli committed
1
import torch
2
import torch.nn.functional as F
3
from coati.models.base import Actor, Critic, RewardModel
4
5
from coati.models.generation import generate
from coati.models.utils import calc_action_log_probs, compute_reward
6
from transformers import PreTrainedTokenizer
Fazzie-Maqianli's avatar
Fazzie-Maqianli committed
7
8
9
10
11
12
13
14
15

from .base import Experience, ExperienceMaker


class NaiveExperienceMaker(ExperienceMaker):
    """
    Naive experience maker.

    Generates sequences with the actor from a batch of prompts, then runs the
    actor, the initial model, the critic and the reward model over those
    sequences and packs the results into an ``Experience``.
    """

    def __init__(
        self,
        actor: Actor,
        critic: Critic,
        reward_model: RewardModel,
        initial_model: Actor,
        tokenizer: PreTrainedTokenizer,
        kl_coef: float = 0.1,
    ) -> None:
        """
        Store the models, the tokenizer and the KL coefficient.

        Args:
            actor: model used for generation and for the action log-probs.
            critic: model whose output is used as the value.
            reward_model: model whose output is fed to ``compute_reward``.
            initial_model: reference model for the base action log-probs.
            tokenizer: supplies ``pad_token_id`` / ``eos_token_id`` and is
                forwarded to ``generate``.
            kl_coef: forwarded to ``compute_reward`` (presumably the weight of
                the KL term -- confirm against ``compute_reward``).
        """
        super().__init__(actor, critic, reward_model, initial_model)
        self.kl_coef = kl_coef
        self.tokenizer = tokenizer

    @torch.no_grad()
    def make_experience(self, input_ids: torch.Tensor, **generate_kwargs) -> Experience:
        """
        Produce one ``Experience`` from a batch of prompt token ids.

        Args:
            input_ids: 2-D tensor of prompt ids; its second dimension is
                treated as the prompt length.
            **generate_kwargs: forwarded unchanged to ``generate``.

        Returns:
            ``Experience(sequences, action_log_probs, value, reward,
            advantage, attention_mask, action_mask)``. ``attention_mask`` is
            ``None`` when the tokenizer has no pad token.
        """
        # Every model is switched to eval mode before it is used.
        for module in (self.actor, self.critic, self.initial_model, self.reward_model):
            module.eval()

        # generate sequences
        sequences = generate(self.actor, input_ids, self.tokenizer, **generate_kwargs)

        # calculate auxiliary tensors
        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            attention_mask = None
        else:
            # 1 wherever the token differs from the pad token, 0 elsewhere.
            not_pad = sequences.ne(pad_id)
            attention_mask = not_pad.to(dtype=torch.long, device=sequences.device)

        prompt_len = input_ids.size(1)
        eos_id = self.tokenizer.eos_token_id
        if eos_id is not None:
            # left padding may be applied, only mask action
            generated = sequences[:, prompt_len:]
            no_eos_so_far = (generated == eos_id).cumsum(dim=-1) == 0
            # Shift right by (1 + prompt_len) filling with True and drop the
            # last column, so that the eos token and the input are included.
            action_mask = F.pad(no_eos_so_far, (1 + prompt_len, -1), value=True)
        else:
            action_mask = torch.ones_like(sequences, dtype=torch.bool)
        action_mask[:, :prompt_len] = False
        action_mask = action_mask[:, 1:]
        generated_len = sequences.size(1) - prompt_len
        action_mask = action_mask[:, -generated_len:]
        num_actions = action_mask.size(1)

        # Log-probs of the taken actions under the current actor.
        actor_logits = self.actor(sequences, attention_mask)["logits"]
        action_log_probs = calc_action_log_probs(actor_logits, sequences, num_actions)

        # Log-probs of the same actions under the initial (reference) model.
        base_logits = self.initial_model(sequences, attention_mask)["logits"]
        base_action_log_probs = calc_action_log_probs(base_logits, sequences, num_actions)

        value = self.critic(sequences, attention_mask)
        r = self.reward_model(sequences, attention_mask)
        reward = compute_reward(
            r,
            self.kl_coef,
            action_log_probs,
            base_action_log_probs,
            action_mask=action_mask,
        )

        advantage = reward - value
        # TODO(ver217): maybe normalize adv
        if advantage.ndim == 1:
            # Give a 1-D advantage a trailing dimension.
            advantage = advantage.unsqueeze(-1)

        return Experience(
            sequences,
            action_log_probs,
            value,
            reward,
            advantage,
            attention_mask,
            action_mask,
        )