This is more of an experimental/research project. It implements a prompting pipeline combined with wrappers for automatic step decomposition and for search through the "step space" (e.g., via beam search, MCTS, etc.), guided by self-evaluation.
Some of the data-extraction/answer-extraction code (`utils.py` and `answer_extraction.py`) is adapted from: https://github.com/AGI-Edgerunners/Plan-and-Solve-Prompting
Dataset references:
@misc{cobbe2021training,
title={Training Verifiers to Solve Math Word Problems},
author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Mark Chen and Heewoo Jun and Lukasz Kaiser and Matthias Plappert and Jerry Tworek and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman},
year={2021},
eprint={2110.14168},
archivePrefix={arXiv},
primaryClass={cs.LG}}
@inproceedings{patel-etal-2021-nlp,
title = "Are {NLP} Models really able to Solve Simple Math Word Problems?",
author = "Patel, Arkil and
Bhattamishra, Satwik and
Goyal, Navin",
booktitle = "Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies",
month = jun,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.naacl-main.168",
doi = "10.18653/v1/2021.naacl-main.168",
pages = "2080--2094",
abstract = "The problem of designing NLP solvers for math word problems (MWP) has seen sustained research activity and steady gains in the test accuracy. Since existing solvers achieve high performance on the benchmark datasets for elementary level MWPs containing one-unknown arithmetic word problems, such problems are often considered {``}solved{''} with the bulk of research attention moving to more complex MWPs. In this paper, we restrict our attention to English MWPs taught in grades four and lower. We provide strong evidence that the existing MWP solvers rely on shallow heuristics to achieve high performance on the benchmark datasets. To this end, we show that MWP solvers that do not have access to the question asked in the MWP can still solve a large fraction of MWPs. Similarly, models that treat MWPs as bag-of-words can also achieve surprisingly high accuracy. Further, we introduce a challenge dataset, SVAMP, created by applying carefully chosen variations over examples sampled from existing datasets. The best accuracy achieved by state-of-the-art models is substantially lower on SVAMP, thus showing that much remains to be done even for the simplest of the MWPs.",
}
@inproceedings{ling-etal-2017-program,
title = "Program Induction by Rationale Generation: Learning to Solve and Explain Algebraic Word Problems",
author = "Ling, Wang and
Yogatama, Dani and
Dyer, Chris and
Blunsom, Phil",
booktitle = "Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2017",
address = "Vancouver, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/P17-1015",
doi = "10.18653/v1/P17-1015",
pages = "158--167",
abstract = "Solving algebraic word problems requires executing a series of arithmetic operations{---}a program{---}to obtain a final answer. However, since programs can be arbitrarily complicated, inducing them directly from question-answer pairs is a formidable challenge. To make this task more feasible, we solve these problems by generating answer rationales, sequences of natural language and human-readable mathematical expressions that derive the final answer through a series of small steps. Although rationales do not explicitly specify programs, they provide a scaffolding for their structure via intermediate milestones. To evaluate our approach, we have created a new 100,000-sample dataset of questions, answers and rationales. Experimental results show that indirect supervision of program learning via answer rationales is a promising strategy for inducing arithmetic programs.",}
@inproceedings{talmor-etal-2019-commonsenseqa,
title = "{C}ommonsense{QA}: A Question Answering Challenge Targeting Commonsense Knowledge",
author = "Talmor, Alon and
Herzig, Jonathan and
Lourie, Nicholas and
Berant, Jonathan",
booktitle = "Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)",
month = jun,
year = "2019",
address = "Minneapolis, Minnesota",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/N19-1421",
doi = "10.18653/v1/N19-1421",
pages = "4149--4158",
abstract = "When answering a question, people often draw upon their rich world knowledge in addition to the particular context. Recent work has focused primarily on answering questions given some relevant document or context, and required very little general background. To investigate question answering with prior knowledge, we present CommonsenseQA: a challenging new dataset for commonsense question answering. To capture common sense beyond associations, we extract from ConceptNet (Speer et al., 2017) multiple target concepts that have the same semantic relation to a single source concept. Crowd-workers are asked to author multiple-choice questions that mention the source concept and discriminate in turn between each of the target concepts. This encourages workers to create questions with complex semantics that often require prior knowledge. We create 12,247 questions through this procedure and demonstrate the difficulty of our task with a large number of strong baselines. Our best baseline is based on BERT-large (Devlin et al., 2018) and obtains 56{%} accuracy, well below human performance, which is 89{%}.",
}
@article{geva-etal-2021-aristotle,
title = "Did Aristotle Use a Laptop? A Question Answering Benchmark with Implicit Reasoning Strategies",
author = "Geva, Mor and
Khashabi, Daniel and
Segal, Elad and
Khot, Tushar and
Roth, Dan and
Berant, Jonathan",
journal = "Transactions of the Association for Computational Linguistics",
volume = "9",
year = "2021",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2021.tacl-1.21",
doi = "10.1162/tacl_a_00370",
pages = "346--361",
abstract = "A key limitation in current datasets for multi-hop reasoning is that the required steps for answering the question are mentioned in it explicitly. In this work, we introduce StrategyQA, a question answering (QA) benchmark where the required reasoning steps are implicit in the question, and should be inferred using a strategy. A fundamental challenge in this setup is how to elicit such creative questions from crowdsourcing workers, while covering a broad range of potential strategies. We propose a data collection procedure that combines term-based priming to inspire annotators, careful control over the annotator population, and adversarial filtering for eliminating reasoning shortcuts. Moreover, we annotate each question with (1) a decomposition into reasoning steps for answering it, and (2) Wikipedia paragraphs that contain the answers to each step. Overall, StrategyQA includes 2,780 examples, each consisting of a strategy question, its decomposition, and evidence paragraphs. Analysis shows that questions in StrategyQA are short, topic-diverse, and cover a wide range of strategies. Empirically, we show that humans perform well (87{%}) on this task, while our best baseline reaches an accuracy of ∼ 66{%}.",
}
The requirements are in `requirements.txt` (the three main libraries are Hugging Face Transformers, vLLM, and PyTorch; the rest are mostly their dependencies). The codebase is mainly set up to work with vLLM-compatible models.
You need to change the model-weight paths in `model.py` (see the constructor of the Generator class). I used locally downloaded checkpoint paths, so it will not work unless you download the checkpoints to similar paths or change the paths. If you want to add a new (vLLM-compatible) model, do the following:

- Add the model name as a `model` option in `argparser.py`.
- Add a prompt template for that specific model name in `prompt.py` (see the examples at the end of the code file). This is optional; there is a default prompt, but it may not be optimal.
- Associate the model name with a checkpoint path in `model.py` (see the constructor of the Generator class in that file for an example). A rough illustration of these steps is sketched after the example command below.

Here is the general template for running the code:
python main.py --search_style=MultiSearch --model=LLAMA30_instruct --gpu_ids="0,1" --prompt_style=cot --dataset=gsm8k --reward_types=confidence+correctness
This will run LLAMA 30B Instruct with `MultiSearch` (explained below) as the search strategy on GSM8K, using the zero-shot chain-of-thought (CoT) prompt (`cot`), with `confidence+correctness` (explained below) as the reward types guiding the search. The model weights will be distributed across cuda:0 and cuda:1 (given `gpu_ids="0,1"`).
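As a rough illustration of the three model-registration steps mentioned above (a hedged sketch only; the names and structures below are hypothetical, and the actual code in `argparser.py`, `prompt.py`, and `model.py` is organized in its own way):

```python
# Hypothetical sketch only -- adapt the idea to the real structures in
# argparser.py, prompt.py, and model.py.
import argparse

# argparser.py: add the new model name to the choices of the --model option.
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, default="LLAMA30_instruct",
                    choices=["LLAMA30_instruct", "MY_NEW_MODEL"])

# prompt.py: add a prompt template keyed by that model name (optional).
PROMPT_TEMPLATES = {
    "MY_NEW_MODEL": "[INST] {instruction} [/INST]",  # whatever format the model expects
}

# model.py: associate the model name with a local checkpoint path
# (conceptually what the Generator class constructor does).
CHECKPOINT_PATHS = {
    "MY_NEW_MODEL": "/path/to/your/local/checkpoint",
}
```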
Some other notable arguments:
- `checkpoint` - set it to True if you want to load an earlier saved checkpoint (checkpoints are saved automatically).
- `SC` - set it to True to enable self-consistency [1]. Only relevant when `search_style=none`.

The available options for every argument, along with the default values, can be found in `argparser.py`.
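For example, to run plain greedy decoding with self-consistency instead of a step-level search, the invocation would look something like this (assuming the usual `--flag=value` form for `SC`):

python main.py --search_style=none --SC=True --model=LLAMA30_instruct --gpu_ids="0" --prompt_style=cot --dataset=gsm8k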
You will find the results in `logs/`.
In this project, we apply various search strategies at the reasoning-step level (each reasoning step counts as a single "move" in the game). This raises the question of how to decompose a generation into a sequence of steps. One way is to create an arbitrary prompt template with a clear structure (which can then be used to parse out steps) and prime the model to follow that structure with a few-shot example of the template. Here, however, I am interested in the zero-shot regime. I try to use zero-shot prompt instructions in particular ways to facilitate different forms of automatic step decomposition. Below, I discuss all the prompts used in this project and the corresponding decomposition methods.
Chain of Thought (`cot`) - This uses the standard zero-shot CoT prompt, `Let's think step by step.` [2]. For step decomposition, `\n` (newline) is used. There are some additional settings to properly ignore empty newlines and the like. Ultimately, this is not necessarily an ideal way to decompose reasoning steps, because not every newline in a CoT result is a complete reasoning step, but it is a baseline starting point that can be done in a zero-shot manner.
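A minimal sketch of what this newline decomposition amounts to (illustrative only, not the repo's actual decomposition code):

```python
def split_cot_steps(generation: str) -> list[str]:
    """Split a zero-shot CoT generation into candidate reasoning steps.

    Each non-empty line is treated as one step; empty lines are ignored.
    """
    steps = [line.strip() for line in generation.split("\n")]
    return [step for step in steps if step]

# Example:
# split_cot_steps("There are 3 apples.\n\nEating 1 leaves 2.\nThe answer is 2.")
# -> ['There are 3 apples.', 'Eating 1 leaves 2.', 'The answer is 2.']
```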
Chain of Thought Step (`cot_step`) - This is a simple extension of zero-shot CoT: `Let's think step by step. Step 1:`. This automatically primes the language model to organize its reasoning chain into numbered steps (Step 1: xyz Step 2: abc ...), which can then easily be used to decompose the steps.
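For illustration, such numbered output can be decomposed with something like the following (a sketch, not the repo's actual parser):

```python
import re

def split_numbered_steps(generation: str) -> list[str]:
    """Split text of the form 'Step 1: ... Step 2: ...' into individual steps."""
    # Split on 'Step <number>:' markers, discarding the markers themselves.
    parts = re.split(r"Step\s*\d+\s*:", generation)
    return [part.strip() for part in parts if part.strip()]
```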
PS+ (`ps`) - This is the zero-shot Plan-and-Solve prompting (the PS+ version) introduced in [3]. The newline decomposition is similar to `cot`.
Tabular Chain of Thought (`cot_tab`) - This is the zero-shot tabular CoT prompt introduced in [4] - `\n|step|subquestion|process|result|\n`. It is a simple way to produce reasoning steps in a structured tabular format. We again use newlines for the decomposition, but unlike before, newline decomposition makes more sense here, since each decomposed newline corresponds to a row (a step) in the table.
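Illustratively, each generated table row can be turned into a structured step roughly as follows (a sketch only; the repo's actual decomposition code differs):

```python
def split_table_steps(generation: str) -> list[dict]:
    """Parse '|step|subquestion|process|result|' style rows into dicts."""
    header = ["step", "subquestion", "process", "result"]
    rows = []
    for line in generation.split("\n"):
        line = line.strip()
        if not line.startswith("|"):
            continue  # skip anything that is not a table row
        cells = [cell.strip() for cell in line.strip("|").split("|")]
        if len(cells) == len(header) and cells[0].lower() != "step":
            rows.append(dict(zip(header, cells)))
    return rows
```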
Struct (`struct`) - This prompt uses elements from many of the prompts above. It provides detailed instructions to decompose the solution/answer into steps and substeps (with subquestion identification, relevant facts, and the solution). This produces highly structured results, which can be decomposed based on a structure similar to `cot`. The details of the prompt can be found in `prompt.py`, and the decomposition code can be found in `node_transition.py`.
Struct Minimal (`struct_min`) - Similar to `struct`, but with one fewer substep. Details in `prompt.py`. I have not run this variant - there may be bugs.
You can modify `prompt.py` to add few-shot prompts.
All the search code can be found in `Search/`.
- `none` - This does not apply any particular search strategy beyond standard autoregressive greedy decoding. It can be combined with `SC=True` for self-consistency over multiple samples.
- `MultiSearch` - This strategy uses multi-sampling. A reward for each sample (the cumulative reward over its decomposed steps) is then computed post hoc. The rewards are used for voting on the answer in various ways.
- `MultiGreedy` - This strategy uses greedy search, but at the step level (unlike `none`). At each iteration, given the history of the chain of reasoning steps so far, the model generates k candidates for the next reasoning step. Each of the k candidates is then scored (assigned a reward), and the maximum-scoring candidate is selected. This strategy is run in parallel for multiple samples of the initial reasoning step, which leads to multiple search results that can be used for self-consistency. It can be thought of as similar to the DFS of Tree of Thoughts [5], but without any backtracking (a minimal sketch of this step-level loop is given after this list).
- `BeamSearch` - This is the beam-search version of the above. The implementation is inspired by [6]. This method can also be thought of as similar to the BFS approach used in Tree of Thoughts [5] (with a truncated frontier).
- `DivBeamSearch` - This is the same as beam search, but it encourages more diversity in the generations by restricting siblings. At each iteration, at most m siblings are allowed (m << beam size; I use m = 2). If there is space left in the beam after selecting all candidates under the sibling restriction, the remaining candidates are added based on their rewards. The idea is similar in spirit to [7], but we do not strictly modify the scoring equation with a penalty - instead, we use more of a hard constraint, as described.
- `MCTS` - This is Monte Carlo Tree Search. The implementation roughly follows the structure here, and takes some inspiration from [8].
- `SPMCTS` - This implementation (Semi-Parallel Monte Carlo Tree Search) parallelizes MCTS a bit more. It selects multiple leaves at once and rolls out multiple paths at once, so it requires fewer sequential iterations. The end result is a similar set of samples to MCTS; however, the implementations are not computationally equivalent, nor are they intended to be.

Note that although some of the methods are inspired by prior work, they are not attempts at perfectly faithful implementations of those papers.
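As referenced in the `MultiGreedy` description, the basic step-level loop behind these strategies looks roughly like the following (a sketch under assumed helper callables, not the repo's actual interfaces in `Search/`):

```python
def step_level_greedy_search(question, generate_step_candidates, step_reward,
                             is_final_step, max_steps=16):
    """Greedy search over reasoning steps: at each iteration, score k candidate
    next steps and keep only the best one (no backtracking)."""
    history = [question]
    for _ in range(max_steps):
        candidates = generate_step_candidates(history)       # k sampled next steps
        if not candidates:
            break
        scored = [(step_reward(history, c), c) for c in candidates]
        best_reward, best_step = max(scored, key=lambda x: x[0])
        history.append(best_step)
        if is_final_step(best_step):                          # e.g. contains "The answer is"
            break
    return history
```

Beam search keeps the top-b partial chains at each iteration instead of just one, and the MCTS variants add selection/expansion/rollout/backpropagation on top of the same step-level transitions.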
Similar to [6, 8], the reward for each step is computed based on self-evaluation and the confidence of the generated step (based on the logprobs). The self-evaluation technique uses the LLM to evaluate its own generations by asking it a multiple-choice question (MCQ) about the helpfulness/correctness of the step (the self-evaluation questions can be found in `rewards.py`).
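A minimal sketch of this kind of reward computation (the exact MCQ wording and the way the signals are combined differ in the actual reward code; `answer_token_probability` is a hypothetical helper that returns the model's probability for a given answer option):

```python
import math

def step_confidence(token_logprobs):
    """Confidence of a generated step: exponential of the mean token logprob."""
    return math.exp(sum(token_logprobs) / max(len(token_logprobs), 1))

def correctness_reward(history, step, answer_token_probability):
    """Self-evaluation reward: probability the model assigns to the 'correct' option."""
    mcq = (f"{' '.join(history)}\n{step}\n"
           "Is the above step correct? (A) Yes (B) No\nAnswer: (")
    return answer_token_probability(mcq, option="A")

def combined_reward(token_logprobs, history, step, answer_token_probability):
    # One simple confidence+correctness style combination: the product of the two.
    return step_confidence(token_logprobs) * correctness_reward(
        history, step, answer_token_probability)
```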
The different reward types and their combinations can be provided as arguments in `argparse.py`.
- `confidence` - uses only the confidence (based on logprobs) as the reward.
- `correctness` - uses only the answer probability of a step-correctness-related MCQ as the reward.
- `helpfulness` - uses only the answer probability of a step-helpfulness-related MCQ as the reward.
- `both` - uses both `correctness` and `helpfulness`.
- `confidence+correctness` - uses both `confidence` and `correctness`.
- `confidence+helpfulness` - uses both `confidence` and `helpfulness`.
- `confidence+both` - uses all of `confidence`, `correctness`, and `helpfulness`.

NB: the prompt styles `struct` and `struct-min` follow different rules. They have a fine-grained substep structure and use rewards appropriate to each substep. The subquestion-identification substep uses only the helpfulness-related reward (since it is technically not a reasoning step that can be correct/incorrect), while the solution substep uses only the correctness-related reward (since, presumably, the helpfulness of a subquestion is already associated with its solution).
The MCQs used for self-evaluation with the different prompts, and the different reward types, can be found in `reward.py`.
Several types of answer-voting mechanisms are implemented and tracked automatically in `main.py` (independent of any `argparse.py` options). They are:
- `Voted Answer` - simple majority voting [1].
- `Reward Voted Answer` - similar to majority voting, but the value of each vote is the reward of the corresponding sample.
- `Top K Reward Voted Answer` - selects the top k (we use k = 5) highest-reward answers and then applies reward voting among them. This allows filtering out potentially "bad" low-reward answers, which could otherwise add up in the vote.
- `Max Reward Answer` - selects the answer with the maximum reward.
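Illustratively, these voting mechanisms amount to roughly the following (a sketch; the tracked implementations are in `main.py`):

```python
from collections import Counter, defaultdict

def vote(answers, rewards, k=5):
    """answers: final answer per sample; rewards: total reward per sample."""
    majority = Counter(answers).most_common(1)[0][0]                # Voted Answer

    reward_sums = defaultdict(float)
    for ans, r in zip(answers, rewards):
        reward_sums[ans] += r
    reward_voted = max(reward_sums, key=reward_sums.get)            # Reward Voted Answer

    top_k = sorted(zip(answers, rewards), key=lambda x: x[1], reverse=True)[:k]
    top_k_sums = defaultdict(float)
    for ans, r in top_k:
        top_k_sums[ans] += r
    top_k_voted = max(top_k_sums, key=top_k_sums.get)               # Top K Reward Voted Answer

    max_reward = max(zip(answers, rewards), key=lambda x: x[1])[0]  # Max Reward Answer
    return majority, reward_voted, top_k_voted, max_reward
```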
Some aspects of `node_transition.py` could be parallelized further (especially the reward computation), but I did not find much benefit in the empirical time cost (`none` remains the fastest). Doing so may also require modifying vLLM and Hugging Face Transformers.

[1] Self-Consistency Improves Chain of Thought Reasoning in Language Models
@inproceedings{
wang2023selfconsistency,
title={Self-Consistency Improves Chain of Thought Reasoning in Language Models},
author={Xuezhi Wang and Jason Wei and Dale Schuurmans and Quoc V Le and Ed H. Chi and Sharan Narang and Aakanksha Chowdhery and Denny Zhou},
booktitle={The Eleventh International Conference on Learning Representations },
year={2023},
url={https://openreview.net/forum?id=1PL1NIMMrw}
}
[2] Large Language Models are Zero-Shot Reasoners
@inproceedings{NEURIPS2022_8bb0d291,
author = {Kojima, Takeshi and Gu, Shixiang (Shane) and Reid, Machel and Matsuo, Yutaka and Iwasawa, Yusuke},
booktitle = {Advances in Neural Information Processing Systems},
editor = {S. Koyejo and S. Mohamed and A. Agarwal and D. Belgrave and K. Cho and A. Oh},
pages = {22199--22213},
publisher = {Curran Associates, Inc.},
title = {Large Language Models are Zero-Shot Reasoners},
url = {https://proceedings.neurips.cc/paper_files/paper/2022/file/8bb0d291acd4acf06ef112099c16f326-Paper-Conference.pdf},
volume = {35},
year = {2022}
}
[3] Plan-and-Solve Prompting: Improving Zero-Shot Chain-of-Thought Reasoning by Large Language Models
@inproceedings{wang-etal-2023-plan,
title = "Plan-and-Solve Prompting: Improving Zero-Shot Chain-of-Thought Reasoning by Large Language Models",
author = "Wang, Lei and
Xu, Wanyu and
Lan, Yihuai and
Hu, Zhiqiang and
Lan, Yunshi and
Lee, Roy Ka-Wei and
Lim, Ee-Peng",
booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.acl-long.147",
pages = "2609--2634",
abstract = "Large language models (LLMs) have recently been shown to deliver impressive performance in various NLP tasks. To tackle multi-step reasoning tasks, Few-shot chain-of-thought (CoT) prompting includes a few manually crafted step-by-step reasoning demonstrations which enable LLMs to explicitly generate reasoning steps and improve their reasoning task accuracy. To eliminate the manual efforts, Zero-shot-CoT concatenates the target problem statement with {``}textit{Let{'}s think step by step}{''} as an input prompt to LLMs. Despite the success of Zero-shot-CoT, it still suffers from three pitfalls: calculation errors, missing-step errors, and semantic misunderstanding errors. To address the missing-step errors, we propose Plan-and-Solve (PS) Prompting. It consists of two components: first, devising a plan to divide the entire task into smaller subtasks, and then carrying out the subtasks according to the plan. To address the calculation errors and improve the quality of generated reasoning steps, we extend PS prompting with more detailed instructions and derive PS+ prompting. We evaluate our proposed prompting strategy on ten datasets across three reasoning problems. The experimental results over GPT-3 show that our proposed zero-shot prompting consistently outperforms Zero-shot-CoT across all datasets by a large margin, is comparable to or exceeds Zero-shot-Program-of-Thought Prompting, and has comparable performance with 8-shot CoT prompting on the math reasoning problem. The code can be found at https://github.com/AGI-Edgerunners/Plan-and-Solve-Prompting.",
}
[4] Tab-CoT: Zero-shot Tabular Chain of Thought
@inproceedings{ziqi-lu-2023-tab,
title = "Tab-{C}o{T}: Zero-shot Tabular Chain of Thought",
author = "Ziqi, Jin and
Lu, Wei",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2023",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-acl.651",
pages = "10259--10277",
abstract = "The chain-of-though (CoT) prompting methods were successful in various natural language processing (NLP) tasks thanks to their ability to unveil the underlying complex reasoning processes.Such reasoning processes typically exhibit highly structured steps.Recent efforts also started investigating methods to encourage more structured reasoning procedures to be captured (cite least to most).In this work, we propose Tab-CoT, a novel tabular-format CoT prompting method, which allows the complex reasoning process to be explicitly modeled in a highly structured manner.Despite its simplicity, we show that our approach is capable of performing reasoning across multiple dimensions (i.e., both rows and columns).We demonstrate our approach{'}s strong zero-shot and few-shot capabilities through extensive experiments on a range of reasoning tasks.",
}
[5] Tree of Thoughts: Deliberate Problem Solving with Large Language Models
@misc{yao2023tree,
title={Tree of Thoughts: Deliberate Problem Solving with Large Language Models},
author={Shunyu Yao and Dian Yu and Jeffrey Zhao and Izhak Shafran and Thomas L. Griffiths and Yuan Cao and Karthik Narasimhan},
year={2023},
eprint={2305.10601},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
[6] Decomposition Enhances Reasoning via Self-Evaluation Guided Decoding
@misc{xie2023decomposition,
title={Decomposition Enhances Reasoning via Self-Evaluation Guided Decoding},
author={Yuxi Xie and Kenji Kawaguchi and Yiran Zhao and Xu Zhao and Min-Yen Kan and Junxian He and Qizhe Xie},
year={2023},
eprint={2305.00633},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
[7] A Simple, Fast Diverse Decoding Algorithm for Neural Generation
@misc{li2016simple,
title={A Simple, Fast Diverse Decoding Algorithm for Neural Generation},
author={Jiwei Li and Will Monroe and Dan Jurafsky},
year={2016},
eprint={1611.08562},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
[8] Reasoning with Language Model is Planning with World Model
@misc{hao2023reasoning,
title={Reasoning with Language Model is Planning with World Model},
author={Shibo Hao and Yi Gu and Haodi Ma and Joshua Jiahua Hong and Zhen Wang and Daisy Zhe Wang and Zhiting Hu},
year={2023},
eprint={2305.14992},
archivePrefix={arXiv},
primaryClass={cs.CL}
}