from unsloth import FastLanguageModel
import torch

# Basic configuration
max_seq_length = 4096  # Maximum sequence length
dtype = None  # None = auto-detect dtype (bfloat16/float16)
load_in_4bit = True  # Use 4-bit quantization to reduce memory usage

# Load the pretrained model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "workdir/model/Qwen2.5-3B-Instruct-bnb-4bit",  # Local path to the Qwen2.5 3B instruct model (4-bit)
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
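
# Optional sanity check (illustrative, not part of the original script): when a
# model is loaded in 4 bits, transformers records a quantization_config on
# model.config; printing it confirms the 4-bit load actually took effect.
print(getattr(model.config, "quantization_config", None))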

# Attach LoRA adapters for parameter-efficient fine-tuning
model = FastLanguageModel.get_peft_model(
    model,
    r = 64,  # LoRA rank; controls the number of trainable parameters
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],  # Target modules to train
    lora_alpha = 64,  # LoRA scaling factor
    lora_dropout = 0,  # LoRA dropout rate
    bias = "none",  # Do not train bias terms
    use_gradient_checkpointing = "unsloth",  # Gradient checkpointing to save VRAM
    random_state = 114514,  # Random seed
    use_rslora = False,  # Whether to use rank-stabilized LoRA (rsLoRA)
    loftq_config = None,  # LoftQ configuration
)
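
# Optional sanity check (assumption: get_peft_model returns a standard PEFT
# model, which exposes print_trainable_parameters()): report how many
# parameters LoRA actually trains versus the frozen total.
model.print_trainable_parameters()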

from unsloth.chat_templates import get_chat_template

# Configure the tokenizer to use the qwen-2.5 chat template
tokenizer = get_chat_template(
    tokenizer,
    chat_template="qwen-2.5",
)
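
# Illustrative check (not in the original script; the sample messages below are
# made up): render a toy conversation with the freshly configured template to
# see the exact text format the model will be trained on.
_demo = [
    {"role": "user", "content": "Hello!"},
    {"role": "assistant", "content": "Hi, how can I help?"},
]
print(tokenizer.apply_chat_template(_demo, tokenize=False, add_generation_prompt=False))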

def formatting_prompts_func(examples):
    """Format a batch of Q&A pairs into chat-template text.

    Args:
        examples: A batched dict with "question" and "answer" lists.

    Returns:
        A dict with a "text" list of formatted conversations.
    """
    questions = examples["question"]
    answers = examples["answer"]

    # Pair each question and answer into a two-turn conversation
    convos = [
        [{"role": "user", "content": q}, {"role": "assistant", "content": r}]
        for q, r in zip(questions, answers)
    ]

    # Render each conversation with tokenizer.apply_chat_template
    texts = [
        tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False)
        for convo in convos
    ]

    return {"text": texts}
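
# Usage sketch (illustrative, with made-up data): the function expects the same
# batched dict shape that dataset.map(batched=True) passes in below.
_batch = {"question": ["What is LoRA?"], "answer": ["A parameter-efficient fine-tuning method."]}
print(formatting_prompts_func(_batch)["text"][0])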

# standardize_sharegpt converts ShareGPT-style data to role/content format; it
# is imported here but unused, since the JSON data is already plain Q&A pairs.
from unsloth.chat_templates import standardize_sharegpt

# Load the dataset
from datasets import load_dataset

dataset = load_dataset("json", data_files="workdir/dataset/dataset.json")
dataset = dataset.map(formatting_prompts_func, batched = True)

# Inspect one formatted example; load_dataset("json", ...) returns a
# DatasetDict, so index into its "train" split
print(dataset["train"][5])
print(dataset["train"][5]["text"])