LaMDA RLHF PyTorch
v0.0.2
PyTorch에서 Google LaMDA 연구 논문의 오픈소스 사전 학습 구현입니다. 이 AI는 지각(sentience)이 전혀 없습니다. 이 저장소는 사전 훈련 아키텍처의 2B 매개변수 구현을 다룰 것입니다. 이는 대부분의 사람들이 훈련할 수 있는 수준이기 때문입니다. 여기에서 LaMDA를 자세히 설명하는 2022년 Google의 최신 블로그 게시물을 검토할 수 있습니다. 여기에서 모델에 대한 2021년 이전 블로그 게시물을 볼 수도 있습니다.
저는 Phil 'Lucid' Wang 박사의 작업에서 큰 영감을 받았습니다. 다양한 변환기 아키텍처에 대한 그의 오픈 소스 구현을 확인하고 그의 작업을 지원하십시오.
개발자 업데이트는 다음에서 찾을 수 있습니다.
# Instantiate the LaMDA base transformer (project-defined class).
lamda_base = LaMDA(
    num_tokens=20000,  # vocabulary size
    dim=512,           # model / embedding dimension
    dim_head=64,       # dimension of each attention head
    depth=12,          # number of transformer layers
    heads=8,           # attention heads per layer
)

# Wrap the base model for autoregressive (next-token) modeling,
# capped at a maximum sequence length of 512 tokens.
lamda = AutoregressiveWrapper(lamda_base, max_seq_len=512)

tokens = torch.randint(0, 20000, (1, 512))  # mock token data
logits = lamda(tokens)  # forward pass -> logits over the vocabulary
print(logits)
@article{DBLP:journals/corr/abs-2201-08239,
  author     = {Romal Thoppilan and
                Daniel De Freitas and
                Jamie Hall and
                Noam Shazeer and
                Apoorv Kulshreshtha and
                Heng{-}Tze Cheng and
                Alicia Jin and
                Taylor Bos and
                Leslie Baker and
                Yu Du and
                YaGuang Li and
                Hongrae Lee and
                Huaixiu Steven Zheng and
                Amin Ghafouri and
                Marcelo Menegali and
                Yanping Huang and
                Maxim Krikun and
                Dmitry Lepikhin and
                James Qin and
                Dehao Chen and
                Yuanzhong Xu and
                Zhifeng Chen and
                Adam Roberts and
                Maarten Bosma and
                Yanqi Zhou and
                Chung{-}Ching Chang and
                Igor Krivokon and
                Will Rusch and
                Marc Pickett and
                Kathleen S. Meier{-}Hellstern and
                Meredith Ringel Morris and
                Tulsee Doshi and
                Renelito Delos Santos and
                Toju Duke and
                Johnny Soraker and
                Ben Zevenbergen and
                Vinodkumar Prabhakaran and
                Mark Diaz and
                Ben Hutchinson and
                Kristen Olson and
                Alejandra Molina and
                Erin Hoffman{-}John and
                Josh Lee and
                Lora Aroyo and
                Ravi Rajakumar and
                Alena Butryna and
                Matthew Lamm and
                Viktoriya Kuzmina and
                Joe Fenton and
                Aaron Cohen and
                Rachel Bernstein and
                Ray Kurzweil and
                Blaise Aguera{-}Arcas and
                Claire Cui and
                Marian Croak and
                Ed H. Chi and
                Quoc Le},
  title      = {{LaMDA}: Language Models for Dialog Applications},
  journal    = {CoRR},
  volume     = {abs/2201.08239},
  year       = {2022},
  url        = {https://arxiv.org/abs/2201.08239},
  eprinttype = {arXiv},
  eprint     = {2201.08239},
  timestamp  = {Fri, 22 Apr 2022 16:06:31 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2201-08239.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}
@misc{https://doi.org/10.48550/arxiv.1706.03762,
  doi        = {10.48550/ARXIV.1706.03762},
  url        = {https://arxiv.org/abs/1706.03762},
  author     = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, Lukasz and Polosukhin, Illia},
  keywords   = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences},
  title      = {Attention Is All You Need},
  eprinttype = {arXiv},
  eprint     = {1706.03762},
  publisher  = {arXiv},
  year       = {2017},
  copyright  = {arXiv.org perpetual, non-exclusive license},
}
@misc{https://doi.org/10.48550/arxiv.1910.10683,
  doi        = {10.48550/ARXIV.1910.10683},
  url        = {https://arxiv.org/abs/1910.10683},
  author     = {Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J.},
  keywords   = {Machine Learning (cs.LG), Computation and Language (cs.CL), Machine Learning (stat.ML), FOS: Computer and information sciences},
  title      = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
  eprinttype = {arXiv},
  eprint     = {1910.10683},
  publisher  = {arXiv},
  year       = {2019},
  copyright  = {arXiv.org perpetual, non-exclusive license},
}
@misc{https://doi.org/10.48550/arxiv.2002.05202,
  doi        = {10.48550/ARXIV.2002.05202},
  url        = {https://arxiv.org/abs/2002.05202},
  author     = {Shazeer, Noam},
  keywords   = {Machine Learning (cs.LG), Neural and Evolutionary Computing (cs.NE), Machine Learning (stat.ML), FOS: Computer and information sciences},
  title      = {{GLU} Variants Improve Transformer},
  eprinttype = {arXiv},
  eprint     = {2002.05202},
  publisher  = {arXiv},
  year       = {2020},
  copyright  = {arXiv.org perpetual, non-exclusive license},
}
@article{DBLP:journals/corr/abs-2101-00027,
  author     = {Leo Gao and
                Stella Biderman and
                Sid Black and
                Laurence Golding and
                Travis Hoppe and
                Charles Foster and
                Jason Phang and
                Horace He and
                Anish Thite and
                Noa Nabeshima and
                Shawn Presser and
                Connor Leahy},
  title      = {The {Pile}: An {800GB} Dataset of Diverse Text for Language Modeling},
  journal    = {CoRR},
  volume     = {abs/2101.00027},
  year       = {2021},
  url        = {https://arxiv.org/abs/2101.00027},
  eprinttype = {arXiv},
  eprint     = {2101.00027},
  timestamp  = {Thu, 14 Oct 2021 09:16:12 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2101-00027.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}
@article{DBLP:journals/corr/abs-1808-06226,
  author     = {Taku Kudo and
                John Richardson},
  title      = {{SentencePiece}: {A} simple and language independent subword tokenizer
                and detokenizer for Neural Text Processing},
  journal    = {CoRR},
  volume     = {abs/1808.06226},
  year       = {2018},
  url        = {http://arxiv.org/abs/1808.06226},
  eprinttype = {arXiv},
  eprint     = {1808.06226},
  timestamp  = {Sun, 02 Sep 2018 15:01:56 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-1808-06226.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}
@inproceedings{sennrich-etal-2016-neural,
  title     = {Neural Machine Translation of Rare Words with Subword Units},
  author    = {Sennrich, Rico and
               Haddow, Barry and
               Birch, Alexandra},
  booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = aug,
  year      = {2016},
  address   = {Berlin, Germany},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/P16-1162},
  doi       = {10.18653/v1/P16-1162},
  pages     = {1715--1725},
}