self rewarding lm pytorch 다운로드 - self rewarding lm pytorch 소스 코드 다운로드

자기 보상 언어 모델

MetaAI의 Self-Rewarding Language Model에서 제안된 훈련 프레임워크 구현

그들은 DPO 논문의 제목을 마음에 새겼습니다.

이 라이브러리에는 Nous Research의 Teknium이 낙관론을 표명한 SPIN 구현도 포함되어 있습니다.

감사

아낌없이 후원해 주신 A16Z 오픈 소스 AI 보조금 프로그램과 🤗 Huggingface, 그리고 현재의 인공 지능 연구를 독립적으로 오픈 소스화할 수 있도록 지원해 주신 다른 후원자분들께 감사드립니다.

설치

$ pip install self-rewarding-lm-pytorch

사용법

 import torch
from torch import Tensor

from self_rewarding_lm_pytorch import (
    SelfRewardingTrainer ,
    create_mock_dataset
)

from x_transformers import TransformerWrapper , Decoder

# Small demo model: an x_transformers TransformerWrapper whose attention
# layers are a Decoder stack.
transformer = TransformerWrapper (
    num_tokens = 256 ,  # same bound as the [0, 256) range sampled for the mock token ids below
    max_seq_len = 1024 ,
    attn_layers = Decoder (
        dim = 512 ,
        depth = 1 ,
        heads = 8
    )
)

# Mock SFT data: each sample is whatever the lambda returns, here a tuple of
# 256 random integer ids drawn from [0, 256) and a scalar tensor holding 1.
# NOTE(review): the leading 100 is presumably the dataset length, and the
# meaning of the scalar is defined by the library - confirm against its docs.
sft_dataset = create_mock_dataset ( 100 , lambda : ( torch . randint ( 0 , 256 , ( 256 ,)), torch . tensor ( 1 )))
# Mock prompt data: every sample is the literal string 'mock prompt'.
prompt_dataset = create_mock_dataset ( 100 , lambda : 'mock prompt' )

def decode_tokens(tokens: Tensor) -> str:
    """Turn a sequence of integer token ids into a string, one character per id.

    Each id is treated as a Unicode code point. Ids below 32 are clamped to
    32 (the space character) before conversion, so control characters never
    appear in the decoded text.
    """
    chars = []
    for token in tokens:
        # max(32, token) keeps 32 unless the token is strictly larger.
        code_point = max(32, token)
        chars.append(str(chr(code_point)))
    return ''.join(chars)

def encode_str(seq_str: str) -> Tensor:
    """Encode a string as a 1-D integer tensor of Unicode code points.

    The result uses an integer dtype (int64). The legacy ``Tensor(list)``
    constructor would yield float32 values, which cannot be passed to
    ``chr()`` (as ``decode_tokens`` does) or used as integer indices.

    Args:
        seq_str: Text to encode; may be empty.

    Returns:
        A tensor of shape ``(len(seq_str),)`` holding ``ord(c)`` for each
        character ``c``, with dtype ``torch.long``.
    """
    return torch.tensor([ord(char) for char in seq_str], dtype=torch.long)

# Build the trainer around the transformer defined above.
trainer = SelfRewardingTrainer (
    transformer ,
    # Finetuning setup given as a plain dict: the mock SFT dataset, the mock
    # prompt dataset, and dpo_num_train_steps = 1000.
    finetune_configs = dict (
        train_sft_dataset = sft_dataset ,
        self_reward_prompt_dataset = prompt_dataset ,
        dpo_num_train_steps = 1000
    ),
    # The character-level helpers defined above act as the tokenizer.
    tokenizer_decode = decode_tokens ,
    tokenizer_encode = encode_str ,
    # NOTE(review): presumably forwarded to Hugging Face Accelerate so the
    # example runs on CPU - confirm against the library.
    accelerate_kwargs = dict (
        cpu = True
    )
)

# Run the trainer.
# NOTE(review): overwrite_checkpoints = True presumably permits replacing
# checkpoint files left by an earlier run - confirm.
trainer ( overwrite_checkpoints = True )

# checkpoints after each finetuning stage will be saved to ./checkpoints

SPIN은 다음과 같이 훈련할 수 있습니다. Readme의 마지막 예제에서처럼 미세 조정 파이프라인에 추가할 수도 있습니다.

 import torch

from self_rewarding_lm_pytorch import (
    SPINTrainer ,
    create_mock_dataset
)

from x_transformers import TransformerWrapper , Decoder

# Demo model for the SPIN example: same wrapper settings as the first
# example, but with a 6-layer Decoder.
transformer = TransformerWrapper (
    num_tokens = 256 ,
    max_seq_len = 1024 ,
    attn_layers = Decoder (
        dim = 512 ,
        depth = 6 ,
        heads = 8
    )
)

# Mock SFT data: each sample is the tuple returned by the lambda (256 random
# integer ids in [0, 256) and a scalar tensor holding 1).
sft_dataset = create_mock_dataset ( 100 , lambda : ( torch . randint ( 0 , 256 , ( 256 ,)), torch . tensor ( 1 )))

# Standalone SPIN trainer over the same model and dataset.
spin_trainer = SPINTrainer (
    transformer ,
    max_seq_len = 16 ,  # note: much smaller than the model's max_seq_len of 1024
    train_sft_dataset = sft_dataset ,
    checkpoint_every = 100 ,  # NOTE(review): presumably counted in training steps - confirm
    # Extra SPIN settings; the keyword is literally the Greek letter lambda.
    spin_kwargs = dict (
        λ = 0.1 ,
    ),
)

# Run SPIN training.
spin_trainer ()

LLM-as-Judge 방식 외에 자신만의 보상 프롬프트를 실험하고 싶다고 가정해 보겠습니다. 먼저 RewardConfig 를 임포트해야 합니다. 그런 다음 이를 self_reward_prompt_config 로 트레이너에 전달하면 됩니다.

 # first import

from self_rewarding_lm_pytorch import RewardConfig

# then say you want to try asking the transformer nicely

# reward_regex_template is the string that will be looked for in the LLM response, for parsing out the reward where {{ reward }} is defined as a number

# Trainer with a custom reward prompt. The bare `...` is a placeholder for
# the remaining trainer arguments, which this snippet does not spell out.
trainer = SelfRewardingTrainer (
    transformer ,
    ...,
    self_reward_prompt_config = RewardConfig (
        # Prompt shown to the model; it contains {{ prompt }} and
        # {{ response }} placeholders and asks for a "Rating: <int>" line.
        prompt_template = """
        Pretty please rate the following user prompt and response
        User: {{ prompt }}
        Response: {{ response }}

        Format your score as follows:
        Rating: <rating as integer from 0 - 10>
        """ ,
        # Pattern searched for in the model's reply; {{ reward }} marks where
        # the numeric reward is parsed from (see the comment above this call).
        reward_regex_template = """
        Rating: {{ reward }}
        """
    )
)

마지막으로, 임의의 순서로 미세 조정을 실험하려는 경우 FinetuneConfig 인스턴스를 목록으로 finetune_configs 에 전달하면 유연성도 확보할 수 있습니다.

예: SPIN, 외부 보상, 자기 보상을 번갈아 적용하는(인터리빙) 연구를 수행하고 싶다고 가정해 보겠습니다.

이 아이디어는 비공개 디스코드 채널에서 Teknium이 제안한 것입니다.

 # import the configs

from self_rewarding_lm_pytorch import (
    SFTConfig ,
    SelfRewardDPOConfig ,
    ExternalRewardDPOConfig ,
    SelfPlayConfig ,
)

# Sketch of a custom finetuning pipeline: finetune_configs receives a list of
# config instances instead of a dict. `model` is the network to finetune (for
# example the `transformer` built in the earlier snippets), and each `(...)`
# stands for that config's own arguments.
# NOTE(review): stages are presumably executed in list order - confirm.
trainer = SelfRewardingTrainer (
    model ,
    finetune_configs = [
        SFTConfig (...),
        SelfPlayConfig (...),
        ExternalRewardDPOConfig (...),
        SelfRewardDPOConfig (...),
        SelfPlayConfig (...),
        SelfRewardDPOConfig (...)
    ],
    # ... remaining trainer keyword arguments go here. A bare `...` positional
    # placeholder after a keyword argument would be a SyntaxError, so the
    # placeholder is kept as a comment.
)

trainer ()

# checkpoints after each finetuning stage will be saved to ./checkpoints

할 일 (Todo)

인용 (Citations)

 @misc { yuan2024selfrewarding ,
    title   = { Self-Rewarding Language Models } , 
    author  = { Weizhe Yuan and Richard Yuanzhe Pang and Kyunghyun Cho and Sainbayar Sukhbaatar and Jing Xu and Jason Weston } ,
    year    = { 2024 } ,
    eprint  = { 2401.10020 } ,
    archivePrefix = { arXiv } ,
    primaryClass = { cs.CL }
}

 @article { Chen2024SelfPlayFC ,
    title   = { Self-Play Fine-Tuning Converts Weak Language Models to Strong Language Models } ,
    author  = { Zixiang Chen and Yihe Deng and Huizhuo Yuan and Kaixuan Ji and Quanquan Gu } ,
    journal = { ArXiv } ,
    year    = { 2024 } ,
    volume  = { abs/2401.01335 } ,
    url     = { https://api.semanticscholar.org/CorpusID:266725672 }
}

 @article { Rafailov2023DirectPO ,
    title   = { Direct Preference Optimization: Your Language Model is Secretly a Reward Model } ,
    author  = { Rafael Rafailov and Archit Sharma and Eric Mitchell and Stefano Ermon and Christopher D. Manning and Chelsea Finn } ,
    journal = { ArXiv } ,
    year    = { 2023 } ,
    volume  = { abs/2305.18290 } ,
    url     = { https://api.semanticscholar.org/CorpusID:258959321 }
}

 @inproceedings { Guo2024DirectLM ,
    title   = { Direct Language Model Alignment from Online AI Feedback } ,
    author  = { Shangmin Guo and Biao Zhang and Tianlin Liu and Tianqi Liu and Misha Khalman and Felipe Llinares and Alexandre Rame and Thomas Mesnard and Yao Zhao and Bilal Piot and Johan Ferret and Mathieu Blondel } ,
    year    = { 2024 } ,
    url     = { https://api.semanticscholar.org/CorpusID:267522951 }
}