تنزيل ZeroPromptSearch - ZeroPromptSearch Source Code Download

المطالبة صفرية اللقطة (Zero-shot Prompting) لتحليل الخطوات والبحث

هذا هو أكثر من مشروع تجريبي/بحثي. وهو ينفذ خط أنابيب مطالبة جنبًا إلى جنب مع غلاف للخطوات المتنافسة تلقائيًا وبحثًا من خلال "مساحة الخطوة" (على سبيل المثال ، عن طريق البحث عن الشعاع ، MCTS ، إلخ) يسترشد بالتقييم الذاتي.

الاعتمادات:

يتم تكييف بعض شيفرات استخراج مجموعات البيانات/استخراج الإجابات ( utils.py و answer_extraction.py ) من:

مراجع مجموعة البيانات:

GSM8K:

  @misc{cobbe2021training,
    author        = {Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and Hesse, Christopher and Schulman, John},
    title         = {Training Verifiers to Solve Math Word Problems},
    year          = {2021},
    eprint        = {2110.14168},
    archivePrefix = {arXiv},
    primaryClass  = {cs.LG},
  }

SVAMP:

 @inproceedings{patel-etal-2021-nlp,
   author    = {Patel, Arkil and Bhattamishra, Satwik and Goyal, Navin},
   title     = {Are {NLP} Models really able to Solve Simple Math Word Problems?},
   booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
   publisher = {Association for Computational Linguistics},
   address   = {Online},
   month     = jun,
   year      = {2021},
   pages     = {2080--2094},
   doi       = {10.18653/v1/2021.naacl-main.168},
   url       = {https://aclanthology.org/2021.naacl-main.168},
   abstract  = {The problem of designing NLP solvers for math word problems (MWP) has seen sustained research activity and steady gains in the test accuracy. Since existing solvers achieve high performance on the benchmark datasets for elementary level MWPs containing one-unknown arithmetic word problems, such problems are often considered {``}solved{''} with the bulk of research attention moving to more complex MWPs. In this paper, we restrict our attention to English MWPs taught in grades four and lower. We provide strong evidence that the existing MWP solvers rely on shallow heuristics to achieve high performance on the benchmark datasets. To this end, we show that MWP solvers that do not have access to the question asked in the MWP can still solve a large fraction of MWPs. Similarly, models that treat MWPs as bag-of-words can also achieve surprisingly high accuracy. Further, we introduce a challenge dataset, SVAMP, created by applying carefully chosen variations over examples sampled from existing datasets. The best accuracy achieved by state-of-the-art models is substantially lower on SVAMP, thus showing that much remains to be done even for the simplest of the MWPs.},
 }

AQuA:

 @inproceedings{ling-etal-2017-program,
   author    = {Ling, Wang and Yogatama, Dani and Dyer, Chris and Blunsom, Phil},
   title     = {Program Induction by Rationale Generation: Learning to Solve and Explain Algebraic Word Problems},
   booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
   publisher = {Association for Computational Linguistics},
   address   = {Vancouver, Canada},
   month     = jul,
   year      = {2017},
   pages     = {158--167},
   doi       = {10.18653/v1/P17-1015},
   url       = {https://aclanthology.org/P17-1015},
   abstract  = {Solving algebraic word problems requires executing a series of arithmetic operations{---}a program{---}to obtain a final answer. However, since programs can be arbitrarily complicated, inducing them directly from question-answer pairs is a formidable challenge. To make this task more feasible, we solve these problems by generating answer rationales, sequences of natural language and human-readable mathematical expressions that derive the final answer through a series of small steps. Although rationales do not explicitly specify programs, they provide a scaffolding for their structure via intermediate milestones. To evaluate our approach, we have created a new 100,000-sample dataset of questions, answers and rationales. Experimental results show that indirect supervision of program learning via answer rationales is a promising strategy for inducing arithmetic programs.},
 }

CommonsenseQA:

 @inproceedings{talmor-etal-2019-commonsenseqa,
   author    = {Talmor, Alon and Herzig, Jonathan and Lourie, Nicholas and Berant, Jonathan},
   title     = {{CommonsenseQA}: A Question Answering Challenge Targeting Commonsense Knowledge},
   booktitle = {Proceedings of the 2019 Conference of the North {American} Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)},
   publisher = {Association for Computational Linguistics},
   address   = {Minneapolis, Minnesota},
   month     = jun,
   year      = {2019},
   pages     = {4149--4158},
   doi       = {10.18653/v1/N19-1421},
   url       = {https://aclanthology.org/N19-1421},
   abstract  = {When answering a question, people often draw upon their rich world knowledge in addition to the particular context. Recent work has focused primarily on answering questions given some relevant document or context, and required very little general background. To investigate question answering with prior knowledge, we present CommonsenseQA: a challenging new dataset for commonsense question answering. To capture common sense beyond associations, we extract from ConceptNet (Speer et al., 2017) multiple target concepts that have the same semantic relation to a single source concept. Crowd-workers are asked to author multiple-choice questions that mention the source concept and discriminate in turn between each of the target concepts. This encourages workers to create questions with complex semantics that often require prior knowledge. We create 12,247 questions through this procedure and demonstrate the difficulty of our task with a large number of strong baselines. Our best baseline is based on BERT-large (Devlin et al., 2018) and obtains 56{\%} accuracy, well below human performance, which is 89{\%}.},
 }

StrategyQA:

 @article{geva-etal-2021-aristotle,
   author    = {Geva, Mor and Khashabi, Daniel and Segal, Elad and Khot, Tushar and Roth, Dan and Berant, Jonathan},
   title     = {Did {Aristotle} Use a Laptop? A Question Answering Benchmark with Implicit Reasoning Strategies},
   journal   = {Transactions of the Association for Computational Linguistics},
   volume    = {9},
   year      = {2021},
   pages     = {346--361},
   publisher = {MIT Press},
   address   = {Cambridge, MA},
   doi       = {10.1162/tacl_a_00370},
   url       = {https://aclanthology.org/2021.tacl-1.21},
   abstract  = {A key limitation in current datasets for multi-hop reasoning is that the required steps for answering the question are mentioned in it explicitly. In this work, we introduce StrategyQA, a question answering (QA) benchmark where the required reasoning steps are implicit in the question, and should be inferred using a strategy. A fundamental challenge in this setup is how to elicit such creative questions from crowdsourcing workers, while covering a broad range of potential strategies. We propose a data collection procedure that combines term-based priming to inspire annotators, careful control over the annotator population, and adversarial filtering for eliminating reasoning shortcuts. Moreover, we annotate each question with (1) a decomposition into reasoning steps for answering it, and (2) Wikipedia paragraphs that contain the answers to each step. Overall, StrategyQA includes 2,780 examples, each consisting of a strategy question, its decomposition, and evidence paragraphs. Analysis shows that questions in StrategyQA are short, topic-diverse, and cover a wide range of strategies. Empirically, we show that humans perform well (87{\%}) on this task, while our best baseline reaches an accuracy of $\sim$ 66{\%}.},
 }

متطلبات

راجع requirements.txt (المكتبات الرئيسية الثلاث هي HuggingFace Transformers و vLLM و PyTorch - الباقي في الغالب تبعيات.)
يجب بناء VLLM من المصدر في الوقت الحالي. استخدم هذا الفرع للاتساق.

إعداد نموذج

يتم إعداد قاعدة التعليمات البرمجية بشكل أساسي للعمل مع نماذج متوافقة مع VLLM.

تم إعداد عدد قليل من الطرز مثل Llama-instruct و Redmond بالفعل. لكن قم بتغيير مسارات الوزن النموذجية كما تحتاج في model.py (انظر مُنشئ فئة المولد). يمكنني استخدام مسارات نقاط التفتيش التي تم تنزيلها محليًا حتى لا تعمل خارج المربع إلا إذا قمت بتنزيل نقاط التفتيش في مسار مماثل أو تغيير المسار.

إذا كنت ترغب في إضافة نموذج جديد (متوافق مع VLLM) افعل ما يلي:

أضف اسم النموذج في argparser.py لخيار model .
إضافة قالب موجه لهذا اسم النموذج المحدد في prompt.py (انظر الأمثلة في نهاية ملف الرمز) (اختياري ؛ هناك موجه افتراضي ولكن ربما لن يكون مثاليًا).
قم بربط اسم النموذج (الاسم الذي حددته في argparser.py) بمسار نقطة تفتيش في model.py (انظر مُنشئ فئة المولد في هذا الملف للحصول على أمثلة).

يجري

هذا قالب عام لتنفيذ الكود:

 python main.py --search_style=MultiSearch --model=LLAMA30_instruct --gpu_ids="0,1" --prompt_style=cot --dataset=gsm8k --reward_types=confidence+correctness

هذا من شأنه أن يدير LLAMA 30B إرشادات مع موجه سلسلة من الفكرة ( cot ) باستخدام MultiSearch (المراد شرحه أدناه) كاستراتيجية بحث على GSM8K باستخدام وظيفة المكافأة ( confidence+correctness -يتم شرحها أدناه) لتوجيهات البحث . سيتم توزيع أوزان النموذج على CUDA: 0 و CUDA: 1 (معطى gpu_ids="0,1" ).

بعض الحجج البارزة الأخرى:

checkpoint - قم بتعيينها بشكل صحيح إذا كنت تقوم بتحميل بعض نقاط التفتيش المحفوظة سابقة (يتم حفظ نقاط التفتيش تلقائيًا)
SC - اضبطها لتمكين التوافق الذاتي [1]. ذات صلة فقط إذا كان search_style=none .

يمكن العثور على الخيارات المتاحة لكل وسيطة والافتراضات في argparser.py .

سجلات

ستجد سجلات التنفيذ في logs/

أنماط سريعة

في هذا المشروع ، نستخدم استراتيجيات البحث المختلفة على مستوى خطوات التفكير (تعتبر كل خطوة تفكير "خطوة واحدة في اللعبة"). هذا يثير أيضًا مسألة كيفية تحلل الجيل إلى سلسلة من الخطوات. تتمثل إحدى طرق القيام بذلك في إنشاء أي قالب موجه تعسفي بهيكل واضح (يمكن استخدامه لخطوات التحليل) ثم استخدام بعض الأمثلة اللقطة مع القالب لتوضيح النموذج لمتابعة الهيكل. هنا ، ومع ذلك ، أنا مهتم بنظام الصفر. أحاول استخدام إرشادات موجه صفريًا بطرق محددة للتحريض على أشكال مختلفة من التحلل التلقائي للخطوات. أدناه ، أناقش جميع الأساليب السريعة المستخدمة في هذا المشروع ومنهجية التحلل المقابلة لها.

سلسلة الأفكار ( cot ) - يستخدم هذا موجه CoT الصفري اللقطة القياسي ، Let's think step by step. [2]. لتحليل الخطوات يُستخدم \n (سطر جديد). هناك بعض الإعدادات الإضافية لتجاهل الأسطر الجديدة الفارغة بشكل صحيح. في النهاية ، هذه ليست بالضرورة طريقة مثالية لتحليل خطوات التفكير ، فليست كل الأسطر الجديدة في نتائج CoT خطوات تفكير كاملة ، ولكنها نقطة انطلاق أساسية يمكن القيام بها بطريقة صفرية اللقطة.
خطوة سلسلة الأفكار ( cot_step ) - هذا امتداد بسيط لـ CoT الصفري اللقطة: Let's think step by step. Step 1: وهذا يحثّ نموذج اللغة تلقائيًا على تنظيم سلسلة تفكيره في خطوات مرقمة (Step 1: XYZ Step 2: ABC ...). يمكن استخدام هذا الهيكل بسهولة لتحليل الخطوات.
PS+ ( ps ) - هذا هو موجه «خطِّط وحُلّ» (Plan-and-Solve) الصفري اللقطة (إصدار PS+) الذي تم تقديمه في [3]. يُستخدم تحليل الأسطر الجديدة على نحو مماثل لـ cot .
سلسلة الأفكار الجدولية ( cot_tab ) - هذا هو موجه CoT الجدولي الصفري اللقطة الذي تم تقديمه في [4] - \n|step|subquestion|process|result|\n وهي طريقة بسيطة لإنتاج خطوات تفكير جدولية منظمة. نستخدم السطر الجديد للتحليل مرة أخرى ، ولكن على عكس ما سبق يكون التحليل بالأسطر الجديدة أكثر جدوى هنا - لأن كل سطر ناتج عن التحليل سيتوافق مع خطوة في الجدول.
بنية ( struct ) - يستخدم هذا المطالبة عناصر للعديد من المطالبات المذكورة أعلاه. ويوفر تعليمات مفصلة لتحليل الحل/الإجابة في الخطوات والبدائل (مع تحديد المشكلات الفرعية ، والحقائق ذات الصلة ، والحل). هذا ينتج نتائج منظمة للغاية ويمكن أن يتحلل وفقًا للهيكل المشابه لـ cot . يمكن العثور على تفاصيل المطالبة في prompt.py ويمكن الاطلاع على رمز التحلل في node_transition.py .
بنية الحد الأدنى ( struct_min ) - يشبه البنية مع واحد أقل بديلة. التفاصيل في prompt.py . لم أقم بتشغيل هذا البديل - يمكن أن يكون هناك أخطاء.

يمكنك تعديل prompt.py لإضافة بعض مطالبات اللقطة.

أنماط البحث

يمكن العثور على جميع رموز البحث في Search/ .

none - لا تطبق هذه الطريقة أي استراتيجية بحث معينة إلى جانب فك تشفير الجشع التلقائي القياسي. يمكن دمج ذلك مع SC=True للاتساق الذاتي مع أخذ العينات المتعددة.
MultiSearch - تستخدم هذه الاستراتيجية أخذ العينات متعددة. ثم يتم إنشاء المكافآت لكل عينة (مكافأة تراكمية لكل خطوة متحللة) بعد الحقيقة. يتم استخدام المكافآت لتصويت إجابات بطرق مختلفة لوصفها لاحقًا.
MultiGreedy - تستخدم هذه الاستراتيجية البحث الجشع ولكن على مستوى الخطوات (على عكس none ). في كل تكرار ، بالنظر إلى سلسلة تاريخ خطوات التفكير ، يولد النموذج بعض المرشحين لخطوة التفكير التالية. ثم يتم تسجيل كل من المرشحين K (تعيين مكافأة). ثم يتم اختيار مرشح تسجيل الحد الأقصى. يتم استخدام هذه الاستراتيجية بالتوازي مع عينات متعددة من خطوات التفكير الأولية التي تؤدي إلى نتائج بحث متعددة يمكن استخدامها للاتساق الذاتي. يمكن اعتبار استراتيجية البحث هذه مشابهة لـ DFS من شجرة الفكر [5] ولكن دون أي تراجع.
BeamSearch - هذا هو إصدار بحث الشعاع مما سبق. التنفيذ مستوحى من [6]. علاوة على ذلك ، يمكن اعتبار هذه الطريقة مشابهة لطريقة BFS (مع الحدود المقطوعة) المستخدمة في شجرة الفكر [5].
DivBeamSearch - هذا هو نفس البحث عن الشعاع ولكنه يشجع المزيد من التنوع في التوليد من خلال تقييد الأشقاء. في كل تكرار بحد أقصى m (m << حجم الشعاع (أنا استخدم M = 2)) يُسمح بالأشقاء. إذا كان هناك مساحة في حجم الحزمة بعد اختيار جميع المرشحين الذين يتبعون هذا ، يتم إضافة هذا تقييد المرشحين المتبقيين بناءً على مكافآتهم. تتشابه الفكرة في الروح مع [7] لكننا لا نقوم بتعديل معادلة التسجيل بدقة مع ركلة جزاء - ولكن نستخدم المزيد من القيد الصعب كما هو موضح.
MCTS - هذا هو Monte Carlo Tree Search. يتبع التنفيذ الهيكل هنا تقريبًا. يستغرق بعض الإلهام من [8].
SPMCTS - هذا التنفيذ (Semi-Parallelized Monte Carlo Tree Search) يوازي MCTS أكثر قليلاً. يختار أوراقًا متعددة في وقت واحد ويُجري عمليات rollout لمسارات متعددة في وقت واحد. على هذا النحو ، فإنه يتطلب تكرارات متسلسلة أقل. والنتيجة النهائية هي عدد مشابه من العينات مثل MCTS. ومع ذلك ، فإن التطبيقين ليسا متكافئين حسابيًا وليس المقصود أن يكونا كذلك.

ملاحظة في حين أن بعض الأساليب مستوحاة من العمل السابق ، إلا أنها لا تحاول أن تكون تطبيقات مخلصة تمامًا لتلك الأوراق.

أنواع المكافآت

على غرار [6،8] يتم حساب المكافأة لكل خطوة بناءً على التقييم الذاتي وثقة توليد الخطوة (استنادًا إلى logprobs). تستخدم تقنية التقييم الذاتي نماذج LLM لتقييم مخرجاتها الخاصة من خلال طرح أسئلة متعددة الاختيار (MCQ) حول فائدة/صحة الخطوة (يمكن العثور على أسئلة التقييم الذاتي في rewards.py ).

تتوفر أنواع مختلفة من المكافآت ومجموعاتها كوسائط في argparser.py .

confidence - لا تستخدم سوى الثقة (استنادًا إلى LogProbs) كمكافأة.
correctness - يستخدم فقط احتمالات الإجابة من خطوة MCQ المتعلقة بالصحة كمكافأة.
helpfulness - لا تستخدم سوى احتمالات الإجابة من خطوة MCQ المتعلقة بالمساعدة كمكافأة
both - يستخدم كلًّا من correctness و helpfulness .
confidence+correctness - يستخدم كلًّا من confidence و correctness .
confidence+helpfulness - يستخدم كلًّا من confidence و helpfulness .
confidence+both - يستخدم كلًّا من confidence و correctness و helpfulness .

ملاحظة: يتبع أسلوبا المطالبة struct و struct_min قواعد مختلفة. فلهما بنى خطوات فرعية دقيقة الحبيبات ويستخدمان المكافآت المناسبة لها. فمثلًا ، لا تستخدم خطوة تحديد المشكلة الفرعية إلا المكافأة المتعلقة بالفائدة (لأنها من الناحية الفنية ليست خطوة تفكير حتى تكون صحيحة/غير صحيحة) ، ولا تستخدم خطوة الحل إلا السؤال المتعلق بالصحة (لأنه من المفترض أن ترتبط فائدة المشكلة الفرعية بفائدة الحل).

يمكن العثور على MCQs المستخدمة لمطالبات مختلفة وأنواع المكافآت المختلفة للتقييم الذاتي في reward.py .

الإجابة على التصويت

يتم تنفيذ عدة أنواع من آليات التصويت على الإجابات وتتبعها تلقائيًا في وقت واحد في main.py (لا تتعلق بأي خيارات في argparser.py ). وهي:

الأغلبية التصويت ( Voted Answer في السجلات) - مجرد تصويت أغلبية بسيطة [1].
تصويت المكافأة ( Reward Voted Answer في السجلات) - على غرار تصويت الأغلبية ، لكن قيمة كل صوت هي المكافأة ( $\in [0,1]$ ) لمسار الإجابة المقابل بدلاً من 1 للجميع.
تصويت المكافأة لأعلى K ( Top K Reward Voted Answer في السجلات) - حدد أعلى K إجابات من حيث المكافأة (نستخدم K = 5) ثم طبّق تصويت المكافأة فيما بينها. يسمح هذا بتصفية الإجابات ذات المكافأة المنخفضة التي يُحتمل أن تكون «سيئة» والتي يمكن أن تتراكم أصواتها.
المكافأة القصوى ( Max Reward Answer في السجلات) - حدد الإجابة ذات المكافأة القصوى.

القيود

الأخطاء - المشروع هو أقرب إلى نموذج أولي تجريبي. يمكن أن يكون هناك بعض الأخطاء في استراتيجيات البحث الأكثر تطوراً. إنها تعمل ، ولكن يمكن أن تكون هناك بعض مشكلات التنفيذ التي قد أحتاج إلى التحقق منها. أنا شخصياً لم يحالفني حظ كبير مع الأساليب الأكثر تطوراً في بعض التجارب المصغّرة. بشكل عام ، يعمل none أو none+SC بشكل جيد بما يكفي مع نتائج يصعب التفوق عليها.
التجميع في دفعات (Batching) - لا يوجد تجميع في دفعات في الكود. ينفذ عينة واحدة في كل مرة. هناك بعض التجميع الداخلي في SPMCTS أو BeamSearch أو ما شابه ، مثل حساب المكافآت لعدة عقد أبناء ، أو الحصول على مرشحين أبناء لعدة عقد آباء في الحزمة. ولكن كل ذلك يرتبط بسؤال/موجه واحد. حاولت أيضًا موازاة بعض جوانب node_transition.py أكثر قليلاً (خاصة حساب المكافآت) ولكن لم أحصل على فائدة كبيرة في تكلفة الوقت التجريبي.
ذاكرة التخزين المؤقت - من المفترض أن يكون أحد القيود التي تحدّ من أداء هذه النماذج هو عدم استخدام التخزين المؤقت للمفاتيح والقيم (KV cache). للحصول على المكافآت ، يجب عليّ عمومًا إنهاء التوليد بعد حساب خطوة التفكير. هذا يعيد ضبط ذاكرة التخزين المؤقت لعمليات التوليد اللاحقة. علاوة على ذلك ، يتطلب توليد المكافآت (التقييم الذاتي) إعادة بناء ذاكرة التخزين المؤقت أيضًا. كل هذا يتطلب إعادة بناء ذاكرة التخزين المؤقت KV عدة مرات. قد يؤدي إعادة استخدام ذاكرة التخزين المؤقت بشكل أفضل إلى تسريع استراتيجيات البحث بشكل كبير عدا none (الذي هو حاليًا الأسرع). ولكن هذا قد يتطلب تعديل vLLM و HuggingFace Transformers.
الوثائق - أحتاج إلى إضافة المزيد من الوثائق (هنا أو في ورقة). ولكن في غضون ذلك ، لأي أسئلة أو أي شيء آخر ، اتصل بالبريد الإلكتروني (مرتبط بحساب GitHub الخاص بي).
MISC- وغني عن القول أيضًا أن هناك أشياء أخرى لا نهاية لها يمكن إضافتها مثل الوزن القائم على التعقيد ، واسترداد الأمثلة تلقائيًا مع الحقيقة الأرضية الاصطناعية (bootstrapping) لعدد قليل من مطالبات اللقطة ، أو نقاش متعدد الوكلاء على سبيل المثال لا الحصر.

الأعمال ذات الصلة

[1] يتنافس الاتساق الذاتي سلسلة من التفكير في نماذج اللغة

 @inproceedings{wang2023selfconsistency,
   author    = {Wang, Xuezhi and Wei, Jason and Schuurmans, Dale and Le, Quoc V and Chi, Ed H. and Narang, Sharan and Chowdhery, Aakanksha and Zhou, Denny},
   title     = {Self-Consistency Improves Chain of Thought Reasoning in Language Models},
   booktitle = {The Eleventh International Conference on Learning Representations},
   year      = {2023},
   url       = {https://openreview.net/forum?id=1PL1NIMMrw},
 }

[2] نماذج اللغة الكبيرة هي أسباب صفرية

 @inproceedings{NEURIPS2022_8bb0d291,
   author    = {Kojima, Takeshi and Gu, Shixiang (Shane) and Reid, Machel and Matsuo, Yutaka and Iwasawa, Yusuke},
   title     = {Large Language Models are Zero-Shot Reasoners},
   booktitle = {Advances in Neural Information Processing Systems},
   editor    = {S. Koyejo and S. Mohamed and A. Agarwal and D. Belgrave and K. Cho and A. Oh},
   volume    = {35},
   pages     = {22199--22213},
   publisher = {Curran Associates, Inc.},
   year      = {2022},
   url       = {https://proceedings.neurips.cc/paper_files/paper/2022/file/8bb0d291acd4acf06ef112099c16f326-Paper-Conference.pdf},
 }

[3] طالبة الخطة والحل: تحسين تفوق سلسلة من النماذج اللغوية الكبيرة

 @inproceedings{wang-etal-2023-plan,
   author    = {Wang, Lei and Xu, Wanyu and Lan, Yihuai and Hu, Zhiqiang and Lan, Yunshi and Lee, Roy Ka-Wei and Lim, Ee-Peng},
   title     = {Plan-and-Solve Prompting: Improving Zero-Shot Chain-of-Thought Reasoning by Large Language Models},
   booktitle = {Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
   publisher = {Association for Computational Linguistics},
   address   = {Toronto, Canada},
   month     = jul,
   year      = {2023},
   pages     = {2609--2634},
   url       = {https://aclanthology.org/2023.acl-long.147},
   abstract  = {Large language models (LLMs) have recently been shown to deliver impressive performance in various NLP tasks. To tackle multi-step reasoning tasks, Few-shot chain-of-thought (CoT) prompting includes a few manually crafted step-by-step reasoning demonstrations which enable LLMs to explicitly generate reasoning steps and improve their reasoning task accuracy. To eliminate the manual efforts, Zero-shot-CoT concatenates the target problem statement with {``}\textit{Let{'}s think step by step}{''} as an input prompt to LLMs. Despite the success of Zero-shot-CoT, it still suffers from three pitfalls: calculation errors, missing-step errors, and semantic misunderstanding errors. To address the missing-step errors, we propose Plan-and-Solve (PS) Prompting. It consists of two components: first, devising a plan to divide the entire task into smaller subtasks, and then carrying out the subtasks according to the plan. To address the calculation errors and improve the quality of generated reasoning steps, we extend PS prompting with more detailed instructions and derive PS+ prompting. We evaluate our proposed prompting strategy on ten datasets across three reasoning problems. The experimental results over GPT-3 show that our proposed zero-shot prompting consistently outperforms Zero-shot-CoT across all datasets by a large margin, is comparable to or exceeds Zero-shot-Program-of-Thought Prompting, and has comparable performance with 8-shot CoT prompting on the math reasoning problem. The code can be found at https://github.com/AGI-Edgerunners/Plan-and-Solve-Prompting.},
 }

[4] علامة التبويب: سلسلة من الفكر الجدولية صفرية

 @inproceedings{ziqi-lu-2023-tab,
   author    = {Ziqi, Jin and Lu, Wei},
   title     = {{Tab-CoT}: Zero-shot Tabular Chain of Thought},
   booktitle = {Findings of the Association for Computational Linguistics: ACL 2023},
   publisher = {Association for Computational Linguistics},
   address   = {Toronto, Canada},
   month     = jul,
   year      = {2023},
   pages     = {10259--10277},
   url       = {https://aclanthology.org/2023.findings-acl.651},
   abstract  = {The chain-of-though (CoT) prompting methods were successful in various natural language processing (NLP) tasks thanks to their ability to unveil the underlying complex reasoning processes. Such reasoning processes typically exhibit highly structured steps. Recent efforts also started investigating methods to encourage more structured reasoning procedures to be captured (cite least to most). In this work, we propose Tab-CoT, a novel tabular-format CoT prompting method, which allows the complex reasoning process to be explicitly modeled in a highly structured manner. Despite its simplicity, we show that our approach is capable of performing reasoning across multiple dimensions (i.e., both rows and columns). We demonstrate our approach{'}s strong zero-shot and few-shot capabilities through extensive experiments on a range of reasoning tasks.},
 }

[5] شجرة الأفكار: حل المشكلات المتعمدة مع نماذج اللغة الكبيرة

 @misc{yao2023tree,
   author        = {Yao, Shunyu and Yu, Dian and Zhao, Jeffrey and Shafran, Izhak and Griffiths, Thomas L. and Cao, Yuan and Narasimhan, Karthik},
   title         = {Tree of Thoughts: Deliberate Problem Solving with Large Language Models},
   year          = {2023},
   eprint        = {2305.10601},
   archivePrefix = {arXiv},
   primaryClass  = {cs.CL},
 }

[6] يعزز التحلل المنطق من خلال التقييم الذاتي فك التشفير

 @misc{xie2023decomposition,
   author        = {Xie, Yuxi and Kawaguchi, Kenji and Zhao, Yiran and Zhao, Xu and Kan, Min-Yen and He, Junxian and Xie, Qizhe},
   title         = {Decomposition Enhances Reasoning via Self-Evaluation Guided Decoding},
   year          = {2023},
   eprint        = {2305.00633},
   archivePrefix = {arXiv},
   primaryClass  = {cs.CL},
 }

[7] خوارزمية فك تشفيرها بسيطة وسريعة متنوعة للجيل العصبي

 @misc{li2016simple,
   author        = {Li, Jiwei and Monroe, Will and Jurafsky, Dan},
   title         = {A Simple, Fast Diverse Decoding Algorithm for Neural Generation},
   year          = {2016},
   eprint        = {1611.08562},
   archivePrefix = {arXiv},
   primaryClass  = {cs.CL},
 }

[8] التفكير مع نموذج اللغة يخطط لنموذج العالم

 @misc{hao2023reasoning,
   author        = {Hao, Shibo and Gu, Yi and Ma, Haodi and Hong, Joshua Jiahua and Wang, Zhen and Wang, Daisy Zhe and Hu, Zhiting},
   title         = {Reasoning with Language Model is Planning with World Model},
   year          = {2023},
   eprint        = {2305.14992},
   archivePrefix = {arXiv},
   primaryClass  = {cs.CL},
 }

يوسع