LaMDA-rlhf-pytorch
v0.0.2
Open-source pre-training implementation of Google's LaMDA research paper in PyTorch. The totally not sentient AI. This repository will cover the 2B-parameter implementation of the pre-training architecture, as that is likely what most can train. You can review Google's latest blog post from 2022 detailing LaMDA here. You can also view the earlier 2021 blog post on the model here.
I was greatly inspired by the work of Dr. Phil 'Lucid' Wang. Check out his open-source implementations of multiple different transformer architectures and support his work.
Developer updates can be found at:
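Usage: the snippet below builds the base LaMDA transformer and wraps it for autoregressive (causal) language modeling. The import path is an assumption based on this repo's layout; adjust it to wherever LaMDA and AutoregressiveWrapper actually live.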
import torch

# Note: the exact import path is an assumption; adjust to this repo's module layout.
from lamda_pytorch import LaMDA, AutoregressiveWrapper

lamda_base = LaMDA(
    num_tokens = 20000,  # vocabulary size
    dim = 512,           # model (embedding) dimension
    dim_head = 64,       # dimension per attention head
    depth = 12,          # number of transformer blocks
    heads = 8            # number of attention heads
)

lamda = AutoregressiveWrapper(lamda_base, max_seq_len = 512)

tokens = torch.randint(0, 20000, (1, 512))  # mock token data
logits = lamda(tokens)
print(logits)
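For context, a minimal next-token training step might look like the sketch below. It assumes the wrapper returns per-position logits of shape (batch, seq_len, num_tokens); the optimizer choice and learning rate are illustrative, not this repo's actual training configuration.

import torch
import torch.nn.functional as F

# Hypothetical training step (sketch): standard causal language modeling loss.
optimizer = torch.optim.Adam(lamda.parameters(), lr = 3e-4)

logits = lamda(tokens)                  # assumed shape: (1, 512, 20000)
loss = F.cross_entropy(
    logits[:, :-1].reshape(-1, 20000),  # predictions for positions 0..510
    tokens[:, 1:].reshape(-1)           # targets: the next token at each position
)
loss.backward()
optimizer.step()
optimizer.zero_grad()

Citations: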
@article{DBLP:journals/corr/abs-2201-08239,
  author = {Romal Thoppilan and
            Daniel De Freitas and
            Jamie Hall and
            Noam Shazeer and
            Apoorv Kulshreshtha and
            Heng{-}Tze Cheng and
            Alicia Jin and
            Taylor Bos and
            Leslie Baker and
            Yu Du and
            YaGuang Li and
            Hongrae Lee and
            Huaixiu Steven Zheng and
            Amin Ghafouri and
            Marcelo Menegali and
            Yanping Huang and
            Maxim Krikun and
            Dmitry Lepikhin and
            James Qin and
            Dehao Chen and
            Yuanzhong Xu and
            Zhifeng Chen and
            Adam Roberts and
            Maarten Bosma and
            Yanqi Zhou and
            Chung{-}Ching Chang and
            Igor Krivokon and
            Will Rusch and
            Marc Pickett and
            Kathleen S. Meier{-}Hellstern and
            Meredith Ringel Morris and
            Tulsee Doshi and
            Renelito Delos Santos and
            Toju Duke and
            Johnny Soraker and
            Ben Zevenbergen and
            Vinodkumar Prabhakaran and
            Mark Diaz and
            Ben Hutchinson and
            Kristen Olson and
            Alejandra Molina and
            Erin Hoffman{-}John and
            Josh Lee and
            Lora Aroyo and
            Ravi Rajakumar and
            Alena Butryna and
            Matthew Lamm and
            Viktoriya Kuzmina and
            Joe Fenton and
            Aaron Cohen and
            Rachel Bernstein and
            Ray Kurzweil and
            Blaise Aguera{-}Arcas and
            Claire Cui and
            Marian Croak and
            Ed H. Chi and
            Quoc Le},
  title = {LaMDA: Language Models for Dialog Applications},
  journal = {CoRR},
  volume = {abs/2201.08239},
  year = {2022},
  url = {https://arxiv.org/abs/2201.08239},
  eprinttype = {arXiv},
  eprint = {2201.08239},
  timestamp = {Fri, 22 Apr 2022 16:06:31 +0200},
  biburl = {https://dblp.org/rec/journals/corr/abs-2201-08239.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@misc{https://doi.org/10.48550/arxiv.1706.03762,
  doi = {10.48550/ARXIV.1706.03762},
  url = {https://arxiv.org/abs/1706.03762},
  author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, Lukasz and Polosukhin, Illia},
  keywords = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences},
  title = {Attention Is All You Need},
  publisher = {arXiv},
  year = {2017},
  copyright = {arXiv.org perpetual, non-exclusive license}
}
@misc{https://doi.org/10.48550/arxiv.1910.10683,
  doi = {10.48550/ARXIV.1910.10683},
  url = {https://arxiv.org/abs/1910.10683},
  author = {Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J.},
  keywords = {Machine Learning (cs.LG), Computation and Language (cs.CL), Machine Learning (stat.ML), FOS: Computer and information sciences},
  title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
  publisher = {arXiv},
  year = {2019},
  copyright = {arXiv.org perpetual, non-exclusive license}
}
@misc{https://doi.org/10.48550/arxiv.2002.05202,
  doi = {10.48550/ARXIV.2002.05202},
  url = {https://arxiv.org/abs/2002.05202},
  author = {Shazeer, Noam},
  keywords = {Machine Learning (cs.LG), Neural and Evolutionary Computing (cs.NE), Machine Learning (stat.ML), FOS: Computer and information sciences},
  title = {GLU Variants Improve Transformer},
  publisher = {arXiv},
  year = {2020},
  copyright = {arXiv.org perpetual, non-exclusive license}
}
@article{DBLP:journals/corr/abs-2101-00027,
  author = {Leo Gao and
            Stella Biderman and
            Sid Black and
            Laurence Golding and
            Travis Hoppe and
            Charles Foster and
            Jason Phang and
            Horace He and
            Anish Thite and
            Noa Nabeshima and
            Shawn Presser and
            Connor Leahy},
  title = {The Pile: An 800GB Dataset of Diverse Text for Language Modeling},
  journal = {CoRR},
  volume = {abs/2101.00027},
  year = {2021},
  url = {https://arxiv.org/abs/2101.00027},
  eprinttype = {arXiv},
  eprint = {2101.00027},
  timestamp = {Thu, 14 Oct 2021 09:16:12 +0200},
  biburl = {https://dblp.org/rec/journals/corr/abs-2101-00027.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-1808-06226,
  author = {Taku Kudo and
            John Richardson},
  title = {SentencePiece: {A} simple and language independent subword tokenizer
           and detokenizer for Neural Text Processing},
  journal = {CoRR},
  volume = {abs/1808.06226},
  year = {2018},
  url = {http://arxiv.org/abs/1808.06226},
  eprinttype = {arXiv},
  eprint = {1808.06226},
  timestamp = {Sun, 02 Sep 2018 15:01:56 +0200},
  biburl = {https://dblp.org/rec/journals/corr/abs-1808-06226.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{sennrich-etal-2016-neural,
  title = {Neural Machine Translation of Rare Words with Subword Units},
  author = {Sennrich, Rico and
            Haddow, Barry and
            Birch, Alexandra},
  booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month = aug,
  year = {2016},
  address = {Berlin, Germany},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/P16-1162},
  doi = {10.18653/v1/P16-1162},
  pages = {1715--1725}
}