soundstorm pytorch下載 - soundstorm pytorch原始碼下載

SoundStorm - Pytorch

在 Pytorch 中實現 SoundStorm（來自 Google Deepmind 的高效並行音訊生成）。

他們基本上將 MaskGiT 應用於來自 Soundstream 的殘差向量量化編碼（codes）。他們選擇使用的 Transformer 架構非常適合音訊領域，名為 Conformer。

專案頁面

致謝

Stability 和 🤗 Huggingface 慷慨贊助尖端人工智慧研究及其開源工作
Lucas Newman 做出了許多貢獻，包括初始訓練程式碼、聲音提示邏輯、每級量化器解碼！
🤗 Accelerate 提供簡單且強大的訓練解決方案
Einops 提供不可或缺的抽象，使建構神經網路變得有趣、簡單且令人振奮
Steven Hillis 提交了正確的遮罩（masking）策略並驗證了儲存庫可正常運作！
Lucas Newman 使用跨多個儲存庫的模型，對一個小型可運作的 SoundStorm 進行了基本訓練，展示了它端到端的運作方式。模型包括 SoundStream、文字到語意（Text-to-Semantic）T5，最後是 SoundStorm Transformer。
@Jiang-Stan，發現了迭代去遮罩（demasking）中的一個關鍵錯誤！

安裝

$ pip install soundstorm-pytorch

用法

 import torch
from soundstorm_pytorch import SoundStorm , ConformerWrapper

conformer = ConformerWrapper (
    codebook_size = 1024 ,
    num_quantizers = 12 ,
    conformer = dict (
        dim = 512 ,
        depth = 2
    ),
)

model = SoundStorm (
    conformer ,
    steps = 18 ,          # 18 steps, as in original maskgit paper
    schedule = 'cosine'  # currently the best schedule is cosine
)

# get your pre-encoded codebook ids from the soundstream from a lot of raw audio

codes = torch . randint ( 0 , 1024 , ( 2 , 1024 , 12 )) # (batch, seq, num residual VQ)

# do the below in a loop for a ton of data

loss , _ = model ( codes )
loss . backward ()

# model can now generate in 18 steps. ~2 seconds sounds reasonable

generated = model . generate ( 1024 , batch_size = 2 ) # (2, 1024)

要直接訓練原始音頻，您需要將預先訓練的SoundStream傳遞到SoundStorm中。您可以在 audiolm-pytorch 上訓練您自己的SoundStream 。

 import torch
from soundstorm_pytorch import SoundStorm , ConformerWrapper , Conformer , SoundStream

conformer = ConformerWrapper (
    codebook_size = 1024 ,
    num_quantizers = 12 ,
    conformer = dict (
        dim = 512 ,
        depth = 2
    ),
)

soundstream = SoundStream (
    codebook_size = 1024 ,
    rq_num_quantizers = 12 ,
    attn_window_size = 128 ,
    attn_depth = 2
)

model = SoundStorm (
    conformer ,
    soundstream = soundstream   # pass in the soundstream
)

# find as much audio you'd like the model to learn

audio = torch . randn ( 2 , 10080 )

# course it through the model and take a gazillion tiny steps

loss , _ = model ( audio )
loss . backward ()

# and now you can generate state-of-the-art speech

generated_audio = model . generate ( seconds = 30 , batch_size = 2 )  # generate 30 seconds of audio (it will calculate the length in seconds based off the sampling frequency and cumulative downsamples in the soundstream passed in above)

完整的文字轉語音將依賴經過訓練的 TextToSemantic 編碼器/解碼器 Transformer。然後，您將載入權重並將其作為 spear_tts_text_to_semantic 傳遞到 SoundStorm。

這是一項正在進行的工作，因為 spear-tts-pytorch 目前僅完成了模型架構，尚未包含預訓練 + 偽標籤 + 反向翻譯邏輯。

 from spear_tts_pytorch import TextToSemantic

text_to_semantic = TextToSemantic (
    dim = 512 ,
    source_depth = 12 ,
    target_depth = 12 ,
    num_text_token_ids = 50000 ,
    num_semantic_token_ids = 20000 ,
    use_openai_tokenizer = True
)

# load the trained text-to-semantic transformer

text_to_semantic . load ( '/path/to/trained/model.pt' )

# pass it into the soundstorm

model = SoundStorm (
    conformer ,
    soundstream = soundstream ,
    spear_tts_text_to_semantic = text_to_semantic
). cuda ()

# and now you can generate state-of-the-art speech

generated_speech = model . generate (
    texts = [
        'the rain in spain stays mainly in the plain' ,
        'the quick brown fox jumps over the lazy dog'
    ]
) # (2, n) - raw waveform decoded from soundstream

待辦事項

引文

 @misc { borsos2023soundstorm ,
    title   = { SoundStorm: Efficient Parallel Audio Generation } , 
    author  = { Zalán Borsos and Matt Sharifi and Damien Vincent and Eugene Kharitonov and Neil Zeghidour and Marco Tagliasacchi } ,
    year    = { 2023 } ,
    eprint  = { 2305.09636 } ,
    archivePrefix = { arXiv } ,
    primaryClass = { cs.SD }
}

 @inproceedings { dao2022flashattention ,
    title   = { Flash{A}ttention: Fast and Memory-Efficient Exact Attention with {IO}-Awareness } ,
    author  = { Dao, Tri and Fu, Daniel Y. and Ermon, Stefano and Rudra, Atri and R{\'e}, Christopher } ,
    booktitle = { Advances in Neural Information Processing Systems } ,
    year    = { 2022 }
}

 @article { Chang2022MaskGITMG ,
    title   = { MaskGIT: Masked Generative Image Transformer } ,
    author  = { Huiwen Chang and Han Zhang and Lu Jiang and Ce Liu and William T. Freeman } ,
    journal = { 2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) } ,
    year    = { 2022 } ,
    pages   = { 11305-11315 }
}

 @article { Lezama2022ImprovedMI ,
    title   = { Improved Masked Image Generation with Token-Critic } ,
    author  = { Jos{\'e} Lezama and Huiwen Chang and Lu Jiang and Irfan Essa } ,
    journal = { ArXiv } ,
    year    = { 2022 } ,
    volume  = { abs/2209.04439 }
}

 @inproceedings { Nijkamp2021SCRIPTSP ,
    title   = { SCRIPT: Self-Critic PreTraining of Transformers } ,
    author  = { Erik Nijkamp and Bo Pang and Ying Nian Wu and Caiming Xiong } ,
    booktitle = { North American Chapter of the Association for Computational Linguistics } ,
    year    = { 2021 }
}

 @inproceedings { rogozhnikov2022einops ,
    title   = { Einops: Clear and Reliable Tensor Manipulations with Einstein-like Notation } ,
    author  = { Alex Rogozhnikov } ,
    booktitle = { International Conference on Learning Representations } ,
    year    = { 2022 } ,
    url     = { https://openreview.net/forum?id=oapKSVM2bcj }
}

 @misc { su2021roformer ,
    title   = { RoFormer: Enhanced Transformer with Rotary Position Embedding } ,
    author  = { Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu } ,
    year    = { 2021 } ,
    eprint  = { 2104.09864 } ,
    archivePrefix = { arXiv } ,
    primaryClass = { cs.CL }
}

 @inproceedings { Zhou2024ValueRL ,
    title   = { Value Residual Learning For Alleviating Attention Concentration In Transformers } ,
    author  = { Zhanchao Zhou and Tianyi Wu and Zhiyun Jiang and Zhenzhong Lan } ,
    year    = { 2024 } ,
    url     = { https://api.semanticscholar.org/CorpusID:273532030 }
}