This is more of an experimental/research project. It implements a prompting pipeline combined with wrappers for automatically decomposing generations into steps and for searching over the step space (e.g., via beam search, MCTS, etc.), guided by self-evaluation.
Some of the data-extraction/answer-extraction code (`utils.py` and `answer_extraction.py`) is adapted from: https://github.com/AGI-Edgerunners/Plan-and-Solve-Prompting
Dataset references:
@misc{cobbe2021training,
title={Training Verifiers to Solve Math Word Problems},
author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Mark Chen and Heewoo Jun and Lukasz Kaiser and Matthias Plappert and Jerry Tworek and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman},
year={2021},
eprint={2110.14168},
archivePrefix={arXiv},
primaryClass={cs.LG}}
@inproceedings{patel-etal-2021-nlp,
title = "Are {NLP} Models really able to Solve Simple Math Word Problems?",
author = "Patel, Arkil and
Bhattamishra, Satwik and
Goyal, Navin",
booktitle = "Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies",
month = jun,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.naacl-main.168",
doi = "10.18653/v1/2021.naacl-main.168",
pages = "2080--2094",
abstract = "The problem of designing NLP solvers for math word problems (MWP) has seen sustained research activity and steady gains in the test accuracy. Since existing solvers achieve high performance on the benchmark datasets for elementary level MWPs containing one-unknown arithmetic word problems, such problems are often considered {``}solved{''} with the bulk of research attention moving to more complex MWPs. In this paper, we restrict our attention to English MWPs taught in grades four and lower. We provide strong evidence that the existing MWP solvers rely on shallow heuristics to achieve high performance on the benchmark datasets. To this end, we show that MWP solvers that do not have access to the question asked in the MWP can still solve a large fraction of MWPs. Similarly, models that treat MWPs as bag-of-words can also achieve surprisingly high accuracy. Further, we introduce a challenge dataset, SVAMP, created by applying carefully chosen variations over examples sampled from existing datasets. The best accuracy achieved by state-of-the-art models is substantially lower on SVAMP, thus showing that much remains to be done even for the simplest of the MWPs.",
}
@inproceedings{ling-etal-2017-program,
title = "Program Induction by Rationale Generation: Learning to Solve and Explain Algebraic Word Problems",
author = "Ling, Wang and
Yogatama, Dani and
Dyer, Chris and
Blunsom, Phil",
booktitle = "Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2017",
address = "Vancouver, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/P17-1015",
doi = "10.18653/v1/P17-1015",
pages = "158--167",
abstract = "Solving algebraic word problems requires executing a series of arithmetic operations{---}a program{---}to obtain a final answer. However, since programs can be arbitrarily complicated, inducing them directly from question-answer pairs is a formidable challenge. To make this task more feasible, we solve these problems by generating answer rationales, sequences of natural language and human-readable mathematical expressions that derive the final answer through a series of small steps. Although rationales do not explicitly specify programs, they provide a scaffolding for their structure via intermediate milestones. To evaluate our approach, we have created a new 100,000-sample dataset of questions, answers and rationales. Experimental results show that indirect supervision of program learning via answer rationales is a promising strategy for inducing arithmetic programs.",}
@inproceedings{talmor-etal-2019-commonsenseqa,
title = "{C}ommonsense{QA}: A Question Answering Challenge Targeting Commonsense Knowledge",
author = "Talmor, Alon and
Herzig, Jonathan and
Lourie, Nicholas and
Berant, Jonathan",
booktitle = "Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)",
month = jun,
year = "2019",
address = "Minneapolis, Minnesota",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/N19-1421",
doi = "10.18653/v1/N19-1421",
pages = "4149--4158",
abstract = "When answering a question, people often draw upon their rich world knowledge in addition to the particular context. Recent work has focused primarily on answering questions given some relevant document or context, and required very little general background. To investigate question answering with prior knowledge, we present CommonsenseQA: a challenging new dataset for commonsense question answering. To capture common sense beyond associations, we extract from ConceptNet (Speer et al., 2017) multiple target concepts that have the same semantic relation to a single source concept. Crowd-workers are asked to author multiple-choice questions that mention the source concept and discriminate in turn between each of the target concepts. This encourages workers to create questions with complex semantics that often require prior knowledge. We create 12,247 questions through this procedure and demonstrate the difficulty of our task with a large number of strong baselines. Our best baseline is based on BERT-large (Devlin et al., 2018) and obtains 56{%} accuracy, well below human performance, which is 89{%}.",
}
@article{geva-etal-2021-aristotle,
title = "Did Aristotle Use a Laptop? A Question Answering Benchmark with Implicit Reasoning Strategies",
author = "Geva, Mor and
Khashabi, Daniel and
Segal, Elad and
Khot, Tushar and
Roth, Dan and
Berant, Jonathan",
journal = "Transactions of the Association for Computational Linguistics",
volume = "9",
year = "2021",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2021.tacl-1.21",
doi = "10.1162/tacl_a_00370",
pages = "346--361",
abstract = "A key limitation in current datasets for multi-hop reasoning is that the required steps for answering the question are mentioned in it explicitly. In this work, we introduce StrategyQA, a question answering (QA) benchmark where the required reasoning steps are implicit in the question, and should be inferred using a strategy. A fundamental challenge in this setup is how to elicit such creative questions from crowdsourcing workers, while covering a broad range of potential strategies. We propose a data collection procedure that combines term-based priming to inspire annotators, careful control over the annotator population, and adversarial filtering for eliminating reasoning shortcuts. Moreover, we annotate each question with (1) a decomposition into reasoning steps for answering it, and (2) Wikipedia paragraphs that contain the answers to each step. Overall, StrategyQA includes 2,780 examples, each consisting of a strategy question, its decomposition, and evidence paragraphs. Analysis shows that questions in StrategyQA are short, topic-diverse, and cover a wide range of strategies. Empirically, we show that humans perform well (87{%}) on this task, while our best baseline reaches an accuracy of ∼ 66{%}.",
}
Dependencies are listed in `requirements.txt` (the three main libraries are Hugging Face Transformers, vLLM, and PyTorch; the rest are mostly their dependencies). The codebase is primarily set up to work with vLLM-compatible models.
You will need to change the model-weight paths in `model.py` (see the constructor of the Generator class). I use locally downloaded checkpoint paths, so the code will not work unless you download the checkpoints to similar paths or change the paths. If you want to add a new (vLLM-compatible) model, do the following (a purely illustrative sketch follows this list):

- Add the model name to the `model` option in `argparser.py`.
- Add a prompt template for that specific model name in `prompt.py` (see the examples at the end of that file). This is optional; there is a default prompt, but it may not be optimal.
- Associate the model name with a checkpoint path in `model.py` (see the Generator class constructor in that file for an example).
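A purely illustrative sketch of what these additions might look like; the actual variable names and data structures in `argparser.py`, `prompt.py`, and `model.py` almost certainly differ, so treat this only as a map of where things go:

```python
# Hypothetical sketch only -- the real structures in these files may differ.

# argparser.py: allow the new name as a --model choice (illustrative name).
MODEL_CHOICES = ["LLAMA30_instruct", "MyNewModel_instruct"]

# prompt.py: map the model name to a prompt template (optional; a default exists).
PROMPT_TEMPLATES = {
    "MyNewModel_instruct": "<s>[INST] {instruction} [/INST]",
}

# model.py: associate the model name with a local checkpoint path inside the
# Generator class constructor.
CHECKPOINT_PATHS = {
    "MyNewModel_instruct": "/path/to/local/checkpoints/my-new-model",
}
```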
This is the general template for running the code:
python main.py --search_style=MultiSearch --model=LLAMA30_instruct --gpu_ids="0,1" --prompt_style=cot --dataset=gsm8k --reward_types=confidence+correctness
This will run LLaMA 30B Instruct with the zero-shot chain-of-thought (`cot`) prompt on GSM8K, using `MultiSearch` (explained below) as the search strategy and `confidence+correctness` (explained below) as the reward for guiding the search. The model weights will be distributed over cuda:0 and cuda:1 (given `gpu_ids="0,1"`).
Some other notable arguments:
- `checkpoint` - set to True if you want to load a previously saved checkpoint (checkpoints are saved automatically).
- `SC` - set to True to enable self-consistency [1]. Only relevant when `search_style=none`.

The available options and default values for each argument can be found in `argparser.py`.
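For example, plain decoding with self-consistency (no step-level search) might be run along these lines; the exact boolean-flag syntax should be checked against `argparser.py`:

python main.py --search_style=none --SC=True --model=LLAMA30_instruct --gpu_ids="0" --prompt_style=cot --dataset=gsm8k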
You will find the results of the runs in `logs/`.
In this project, we apply various search strategies at the level of reasoning steps (each reasoning step counts as a single "move in the game"). This raises the question of how to decompose a generation into a sequence of steps. One way is to create an arbitrary prompt template with a clear structure (which can then be used to parse out steps) and prime the model to follow that structure with a few-shot examples of the template. Here, however, I am interested in the zero-shot regime. I try to use zero-shot prompt instructions in particular ways that facilitate different forms of automatic step decomposition. Below, I discuss all the prompt styles used in this project and the corresponding decomposition methods.
Chain of Thought (`cot`) - This uses the standard zero-shot CoT prompt, "Let's think step by step." [2]. For step decomposition, `\n` (newline) is used, with some additional handling to properly ignore empty newlines and the like. Ultimately, this is not necessarily an ideal way to decompose reasoning steps, since not every newline in a CoT result is a complete reasoning step, but it is a baseline starting point that can be done zero-shot.
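A minimal sketch of this kind of newline-based decomposition (illustrative; the repository's actual decomposition code may differ):

```python
def decompose_by_newline(generation: str) -> list[str]:
    """Split a zero-shot CoT generation into candidate reasoning steps."""
    lines = (line.strip() for line in generation.split("\n"))
    return [line for line in lines if line]  # drop empty/whitespace-only lines
```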
Chain of Thought with step numbers (`cot_step`) - This is a simple extension of zero-shot CoT: "Let's think step by step. Step 1:". This automatically primes the language model to organize its reasoning chain into numbered steps (Step 1: xyz Step 2: abc ...), which can then easily be used for step decomposition.
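A hedged sketch of how such numbered steps could be split out (illustrative; the actual parsing may differ):

```python
import re

def decompose_by_step_markers(generation: str) -> list[str]:
    """Split a 'Step 1: ... Step 2: ...' style generation into individual steps."""
    parts = re.split(r"Step\s*\d+\s*:", generation)
    return [part.strip() for part in parts if part.strip()]
```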
PS+ (`ps`) - This is the zero-shot Plan-and-Solve prompt (the PS+ version) introduced in [3]. Newline decomposition is done similarly to `cot`.
Tabular Chain of Thought (`cot_tab`) - This is the zero-shot Tab-CoT prompt introduced in [4] - `\n|step|subquestion|process|result|\n`. It is a simple way to produce structured, tabular-format reasoning steps. We again use newlines for decomposition, but unlike before, newline decomposition makes more sense here, since each decomposed newline corresponds to one step (row) of the table.
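Illustratively, each non-empty line of a Tab-CoT generation is one row, and can be further split on `|` if the column values are needed (a sketch, not the repository's code):

```python
def decompose_table_rows(generation: str) -> list[dict]:
    """Parse '|step|subquestion|process|result|' style rows into dicts."""
    columns = ["step", "subquestion", "process", "result"]
    rows = []
    for line in generation.split("\n"):
        cells = [cell.strip() for cell in line.strip().strip("|").split("|")]
        if len(cells) == len(columns) and cells != columns:  # skip the header and malformed rows
            rows.append(dict(zip(columns, cells)))
    return rows
```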
Struct (`struct`) - This prompt uses elements of many of the prompts above. It gives detailed instructions for decomposing the solution/answer into steps and substeps (with subquestion identification, relevant facts, and a solution). This produces highly structured results that can be decomposed in a manner similar to `cot`. The details of the prompt can be found in `prompt.py`, and the decomposition code can be found in `node_transition.py`.
Struct minimal (`struct_min`) - This is similar to struct, but with one fewer substep. Details are in `prompt.py`. I have not run this variant; there may be bugs.
You can modify `prompt.py` to add few-shot prompts.
All the search code can be found in `Search/`.
`none` - Besides standard autoregressive greedy decoding, this method applies no particular search strategy. It can be combined with `SC=True` for self-consistency over multiple samples.

`MultiSearch` - This strategy uses multi-sampling. Rewards for each sample (cumulative rewards over each decomposed step) are then computed after the fact, and the rewards are used for answer voting in various ways.

`MultiGreedy` - This strategy uses greedy search, but at the step level (unlike `none`). At each iteration, given the history of the chain of reasoning steps so far, the model generates K candidate next reasoning steps. Each of the K candidates is then scored (assigned a reward), and the highest-scoring candidate is selected. This strategy is run in parallel for multiple samples of the initial reasoning step, which leads to multiple search results that can be used for self-consistency. It can be thought of as similar to the DFS of Tree-of-Thoughts [5], but without any backtracking (a minimal sketch of this loop appears at the end of this section).

`BeamSearch` - This is the beam-search version of the above. The implementation is inspired by [6]. This method can also be thought of as similar to the BFS approach used in Tree-of-Thoughts [5] (with a truncated frontier).

`DivBeamSearch` - This is the same as beam search, but it encourages more diversity in the generation by restricting siblings. In each iteration, at most M (M << beam size; I use M = 2) siblings are allowed. If there is space left in the beam after selecting all candidates that respect the restriction, the residual candidates are added back based on their rewards. The idea is similar in spirit to [7], but we do not strictly modify the scoring equation with a penalty; instead, we use a harder constraint as described.

`MCTS` - This is Monte Carlo Tree Search. The implementation roughly follows the structure here, and takes some inspiration from [8].

`SPMCTS` - This implementation (Semi-Parallel Monte Carlo Tree Search) parallelizes MCTS a bit more. It selects multiple leaves at a time and rolls out multiple paths at a time, so it requires fewer sequential iterations. The end result is a similar set of samples to MCTS, but the implementations are not computationally equivalent, nor are they intended to be.

Note that although some of the methods are inspired by prior work, they are not attempts at perfectly faithful implementations of those papers.
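To make the shared structure concrete, here is a minimal, hedged sketch of the step-level greedy loop described under `MultiGreedy`; the real implementations in `Search/` additionally handle batching, termination detection, reward bookkeeping, and the other strategies:

```python
def greedy_step_search(question, generate_candidates, score_step, max_steps=12):
    """Step-level greedy search: at each iteration, generate K candidate next
    steps given the history, score each candidate, and keep only the best one.

    generate_candidates(question, history) -> list[str] and
    score_step(question, history, step) -> float are placeholders for the
    model call and the self-evaluation reward, respectively."""
    history = []
    for _ in range(max_steps):
        candidates = generate_candidates(question, history)
        if not candidates:
            break
        best = max(candidates, key=lambda step: score_step(question, history, step))
        history.append(best)
        if "answer is" in best.lower():  # crude stop condition, purely illustrative
            break
    return history
```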
Similar to [6, 8], the reward for each step is computed based on self-evaluation and the confidence of the generated step (based on logprobs). The self-evaluation technique uses the LLM to evaluate its own generations by asking multiple-choice questions (MCQs) about the helpfulness/correctness of a step (the self-evaluation questions can be found in `rewards.py`).
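As a rough illustration of the two ingredients (the exact MCQ wording, option handling, and the way the components are combined in the repository may differ):

```python
import math

def step_confidence(token_logprobs):
    """Confidence of a generated step: exponentiated mean token logprob (illustrative)."""
    return math.exp(sum(token_logprobs) / max(len(token_logprobs), 1))

def self_eval_reward(option_logprobs, good_option="A"):
    """Self-evaluation reward: normalized probability the model assigns to the
    MCQ option that marks the step as correct/helpful (illustrative)."""
    probs = {option: math.exp(lp) for option, lp in option_logprobs.items()}
    return probs[good_option] / sum(probs.values())

# One plausible (assumed, not confirmed) combination for "confidence+correctness":
# reward = step_confidence(step_token_logprobs) * self_eval_reward(mcq_option_logprobs)
```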
The different reward types and combinations can be provided as arguments in `argparse.py`:
- `confidence` - uses only the confidence (based on logprobs) as the reward.
- `correctness` - uses only the answer probability of the step-correctness-related MCQ as the reward.
- `helpfulness` - uses only the answer probability of the step-helpfulness-related MCQ as the reward.
- `both` - uses both `correctness` and `helpfulness`.
- `confidence+correctness` - uses both `confidence` and `correctness`.
- `confidence+helpfulness` - uses `confidence` and `helpfulness`.
- `confidence+both` - uses all of `confidence`, `correctness`, and `helpfulness`.

NB: the prompt styles `struct` and `struct-min` follow different rules. They have a fine-grained substep structure and use rewards suited to each substep. The subquestion-identification substep uses only the helpfulness reward (since it is technically not a reasoning step that can be correct/incorrect), whereas the solution substep uses only the correctness-related reward (since, presumably, the helpfulness of the subquestion carries over to its solution).
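Illustratively, a `reward_types` string could be expanded into its components like this (a hypothetical helper, not the repository's actual handling):

```python
def parse_reward_types(reward_types: str) -> set[str]:
    """Expand e.g. 'confidence+both' into its component reward names."""
    components = set()
    for part in reward_types.split("+"):
        if part == "both":
            components.update({"correctness", "helpfulness"})
        else:
            components.add(part)
    return components

assert parse_reward_types("confidence+both") == {"confidence", "correctness", "helpfulness"}
```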
The MCQs used for self-evaluation with the different prompts, and the different reward types, can be found in `reward.py`.
Several types of answer-voting mechanisms are implemented and tracked automatically in `main.py` (independent of any `argparse.py` options). They are listed below (a small sketch of the schemes follows the list):
- `Voted Answer` - simple majority voting [1].
- `Reward Voted Answer` - similar to majority voting, but each vote is weighted by its reward.
- `Top K Reward Voted Answer` - selects the top K (we use K = 5) highest-reward answers and then applies reward voting among them. This filters out potentially "bad" low-reward answers, which could otherwise add up in the vote.
- `Max Reward Answer` - selects the answer with the maximum reward.
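A hedged sketch of the four schemes over a list of (answer, reward) pairs; the actual bookkeeping in `main.py` may differ:

```python
from collections import Counter, defaultdict

def vote(samples, top_k=5):
    """samples: list of (answer, cumulative_reward) pairs from a run."""
    majority = Counter(answer for answer, _ in samples).most_common(1)[0][0]

    def reward_vote(pairs):
        totals = defaultdict(float)
        for answer, reward in pairs:
            totals[answer] += reward
        return max(totals, key=totals.get)

    top_k_pairs = sorted(samples, key=lambda p: p[1], reverse=True)[:top_k]
    return {
        "Voted Answer": majority,
        "Reward Voted Answer": reward_vote(samples),
        "Top K Reward Voted Answer": reward_vote(top_k_pairs),
        "Max Reward Answer": max(samples, key=lambda p: p[1])[0],
    }
```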
Some aspects of `node_transition.py` could be parallelized further (especially the reward computation), but I did not find much benefit in empirical time cost (`none` remains the fastest search style). Further speedups may require modifying vLLM and Hugging Face Transformers.

[1] Self-Consistency Improves Chain of Thought Reasoning in Language Models
@inproceedings{
wang2023selfconsistency,
title={Self-Consistency Improves Chain of Thought Reasoning in Language Models},
author={Xuezhi Wang and Jason Wei and Dale Schuurmans and Quoc V Le and Ed H. Chi and Sharan Narang and Aakanksha Chowdhery and Denny Zhou},
booktitle={The Eleventh International Conference on Learning Representations },
year={2023},
url={https://openreview.net/forum?id=1PL1NIMMrw}
}
[2] Large Language Models are Zero-Shot Reasoners
@inproceedings{NEURIPS2022_8bb0d291,
author = {Kojima, Takeshi and Gu, Shixiang (Shane) and Reid, Machel and Matsuo, Yutaka and Iwasawa, Yusuke},
booktitle = {Advances in Neural Information Processing Systems},
editor = {S. Koyejo and S. Mohamed and A. Agarwal and D. Belgrave and K. Cho and A. Oh},
pages = {22199--22213},
publisher = {Curran Associates, Inc.},
title = {Large Language Models are Zero-Shot Reasoners},
url = {https://proceedings.neurips.cc/paper_files/paper/2022/file/8bb0d291acd4acf06ef112099c16f326-Paper-Conference.pdf},
volume = {35},
year = {2022}
}
[3] Plan-and-Solve Prompting: Improving Zero-Shot Chain-of-Thought Reasoning by Large Language Models
@inproceedings{wang-etal-2023-plan,
title = "Plan-and-Solve Prompting: Improving Zero-Shot Chain-of-Thought Reasoning by Large Language Models",
author = "Wang, Lei and
Xu, Wanyu and
Lan, Yihuai and
Hu, Zhiqiang and
Lan, Yunshi and
Lee, Roy Ka-Wei and
Lim, Ee-Peng",
booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.acl-long.147",
pages = "2609--2634",
abstract = "Large language models (LLMs) have recently been shown to deliver impressive performance in various NLP tasks. To tackle multi-step reasoning tasks, Few-shot chain-of-thought (CoT) prompting includes a few manually crafted step-by-step reasoning demonstrations which enable LLMs to explicitly generate reasoning steps and improve their reasoning task accuracy. To eliminate the manual efforts, Zero-shot-CoT concatenates the target problem statement with {``}textit{Let{'}s think step by step}{''} as an input prompt to LLMs. Despite the success of Zero-shot-CoT, it still suffers from three pitfalls: calculation errors, missing-step errors, and semantic misunderstanding errors. To address the missing-step errors, we propose Plan-and-Solve (PS) Prompting. It consists of two components: first, devising a plan to divide the entire task into smaller subtasks, and then carrying out the subtasks according to the plan. To address the calculation errors and improve the quality of generated reasoning steps, we extend PS prompting with more detailed instructions and derive PS+ prompting. We evaluate our proposed prompting strategy on ten datasets across three reasoning problems. The experimental results over GPT-3 show that our proposed zero-shot prompting consistently outperforms Zero-shot-CoT across all datasets by a large margin, is comparable to or exceeds Zero-shot-Program-of-Thought Prompting, and has comparable performance with 8-shot CoT prompting on the math reasoning problem. The code can be found at https://github.com/AGI-Edgerunners/Plan-and-Solve-Prompting.",
}
[4] Tab-CoT: Zero-shot Tabular Chain of Thought
@inproceedings{ziqi-lu-2023-tab,
title = "Tab-{C}o{T}: Zero-shot Tabular Chain of Thought",
author = "Ziqi, Jin and
Lu, Wei",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2023",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-acl.651",
pages = "10259--10277",
abstract = "The chain-of-though (CoT) prompting methods were successful in various natural language processing (NLP) tasks thanks to their ability to unveil the underlying complex reasoning processes.Such reasoning processes typically exhibit highly structured steps.Recent efforts also started investigating methods to encourage more structured reasoning procedures to be captured (cite least to most).In this work, we propose Tab-CoT, a novel tabular-format CoT prompting method, which allows the complex reasoning process to be explicitly modeled in a highly structured manner.Despite its simplicity, we show that our approach is capable of performing reasoning across multiple dimensions (i.e., both rows and columns).We demonstrate our approach{'}s strong zero-shot and few-shot capabilities through extensive experiments on a range of reasoning tasks.",
}
[5] Tree of Thoughts: Deliberate Problem Solving with Large Language Models
@misc{yao2023tree,
title={Tree of Thoughts: Deliberate Problem Solving with Large Language Models},
author={Shunyu Yao and Dian Yu and Jeffrey Zhao and Izhak Shafran and Thomas L. Griffiths and Yuan Cao and Karthik Narasimhan},
year={2023},
eprint={2305.10601},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
[6] Decomposition Enhances Reasoning via Self-Evaluation Guided Decoding
@misc{xie2023decomposition,
title={Decomposition Enhances Reasoning via Self-Evaluation Guided Decoding},
author={Yuxi Xie and Kenji Kawaguchi and Yiran Zhao and Xu Zhao and Min-Yen Kan and Junxian He and Qizhe Xie},
year={2023},
eprint={2305.00633},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
[7] A Simple, Fast Diverse Decoding Algorithm for Neural Generation
@misc{li2016simple,
title={A Simple, Fast Diverse Decoding Algorithm for Neural Generation},
author={Jiwei Li and Will Monroe and Dan Jurafsky},
year={2016},
eprint={1611.08562},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
[8] Reasoning with Language Model is Planning with World Model
@misc{hao2023reasoning,
title={Reasoning with Language Model is Planning with World Model},
author={Shibo Hao and Yi Gu and Haodi Ma and Joshua Jiahua Hong and Zhen Wang and Daisy Zhe Wang and Zhiting Hu},
year={2023},
eprint={2305.14992},
archivePrefix={arXiv},
primaryClass={cs.CL}
}