feat(tools): 添加格式化对话数据的函数
- 新增 formatting_prompts_func 函数,用于格式化对话数据
- 该函数将问题和答案组合成对话形式,并使用 tokenizer.apply_chat_template 进行格式化
- 更新 imports,添加了 unsloth.chat_templates 模块
This commit is contained in:
@@ -1,4 +1,29 @@
|
||||
import os
|
||||
from unsloth.chat_templates import get_chat_template
|
||||
|
||||
def formatting_prompts_func(examples, tokenizer):
    """Render question/answer pairs as chat-formatted text.

    Each question is paired with the answer at the same index to build a
    two-turn conversation (``user`` followed by ``assistant``), which is
    then passed to ``tokenizer.apply_chat_template``.

    Args:
        examples: Mapping with ``"question"`` and ``"answer"`` entries, each
            an iterable of equal length (presumably a batch produced by a
            batched dataset ``map`` call -- confirm against the caller).
        tokenizer: Object exposing ``apply_chat_template(conversation,
            tokenize=..., add_generation_prompt=...)``.

    Returns:
        A dict ``{"text": [...]}`` holding, for every pair, the value
        returned by ``tokenizer.apply_chat_template``.

    Raises:
        ValueError: If the number of questions and answers differ.
    """
    questions = list(examples["question"])
    answers = list(examples["answer"])

    # zip() would silently discard the unmatched tail, hiding misaligned
    # data; fail loudly instead.
    if len(questions) != len(answers):
        raise ValueError(
            "question/answer length mismatch: "
            f"{len(questions)} questions vs {len(answers)} answers"
        )

    # Combine each question and its response into a two-turn conversation.
    conversations = [
        [
            {"role": "user", "content": question},
            {"role": "assistant", "content": response},
        ]
        for question, response in zip(questions, answers)
    ]

    # The conversations are complete (the assistant turn is present), so no
    # generation prompt is appended; tokenize=False requests text output.
    texts = [
        tokenizer.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=False
        )
        for conversation in conversations
    ]

    return {"text": texts}
|
||||
|
||||
def get_model_name(model):
    """Return the last path component of ``model.name_or_path``.

    Trailing path separators are stripped first, so a directory path such as
    ``"/models/my-model/"`` yields ``"my-model"`` instead of an empty string.

    Args:
        model: Object exposing a ``name_or_path`` attribute (a string or
            ``os.PathLike``).

    Returns:
        The final component of the path, or ``""`` when the path is empty or
        consists only of separators.
    """
    path = os.fspath(model.name_or_path)
    # os.path.basename("a/b/") is "", so drop trailing separators up front.
    separators = os.sep + (os.altsep or "")
    return os.path.basename(path.rstrip(separators))
|
Reference in New Issue
Block a user