refactor(train): move the training logic into a new module

- Move the training logic from train_page.py into tools/model.py
- Add a train_model function that wraps the complete training pipeline
- Update the callback in train_page.py to call the new training function
- Remove unused imports from train_page.py
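With this refactor the page callback only collects the hyperparameters and delegates to tools.model.train_model, so the pipeline can also be driven without the Gradio UI. A minimal sketch of calling the extracted function directly, based on the signature introduced in the diff below; the dataset name and the hyperparameter values are illustrative, not taken from this commit:

    from tinydb import Query
    from tools import train_model
    from global_var import get_model, get_tokenizer, get_datasets, get_workdir

    # Look up a stored dataset and unpack it the same way the page callback does.
    record = get_datasets().get(Query().name == "my_dataset")   # hypothetical dataset name
    messages = [item["message"][0] for item in record["dataset_items"]]

    train_model(get_model(), get_tokenizer(), messages, get_workdir(),
                learning_rate=2e-4,                 # illustrative hyperparameters
                per_device_train_batch_size=2,
                epoch=3,
                save_steps=20,
                lora_rank=16)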
parent bb1d8fbd38
commit 1a2ca3e244
train_page.py
@@ -1,23 +1,12 @@
-import unsloth
 import gradio as gr
 import sys
 import torch
-import pandas as pd
 from tinydb import Query
 from pathlib import Path
-from datasets import Dataset as HFDataset
-from transformers import TrainerCallback
-
-from unsloth import FastLanguageModel
-from trl import SFTTrainer  # trainer for supervised fine-tuning
-from transformers import TrainingArguments,DataCollatorForSeq2Seq  # training argument configuration
-from unsloth import is_bfloat16_supported  # check whether bfloat16 training is supported
-from unsloth.chat_templates import get_chat_template, train_on_responses_only
-from tools import formatting_prompts_func

 sys.path.append(str(Path(__file__).resolve().parent.parent))
 from global_var import get_model, get_tokenizer, get_datasets, get_workdir
-from tools import formatting_prompts_func
+from tools import train_model

 def train_page():
     with gr.Blocks() as demo:
@@ -46,166 +35,30 @@ def train_page():
         # Training status output
         output = gr.Textbox(label="训练日志", interactive=False)

-        # Add a loss-curve display
-        loss_plot = gr.LinePlot(
-            x="step",
-            y="loss",
-            title="训练Loss曲线",
-            interactive=True,
-            width=600,
-            height=300
-        )
-
-        def train_model(dataset_name, learning_rate, per_device_train_batch_size, epoch, save_steps, lora_rank):
-            # Use the hyperparameters passed in dynamically
+        def start_training(dataset_name, learning_rate, per_device_train_batch_size, epoch, save_steps, lora_rank):
+            # Use the hyperparameters passed in dynamically
             learning_rate = float(learning_rate)
             per_device_train_batch_size = int(per_device_train_batch_size)
             epoch = int(epoch)
             save_steps = int(save_steps)  # new save-steps parameter
             lora_rank = int(lora_rank)  # new LoRA-rank parameter

-            # Model configuration
-            dtype = None  # data type; None means auto-select
-            load_in_4bit = False  # load the model with 4-bit quantization to save VRAM
-
-            # Load the pretrained model and tokenizer
-            model = get_model()
-            tokenizer = get_tokenizer()
-
-            model = FastLanguageModel.get_peft_model(
-                # the base model
-                model,
-                # LoRA rank: controls the dimension of the low-rank matrices; larger values mean more
-                # trainable parameters and possibly better quality, but higher training cost
-                # suggested range: 8-32
-                r=lora_rank,  # use the dynamically passed-in LoRA rank
-                # modules to apply LoRA to
-                target_modules=[
-                    "q_proj", "k_proj", "v_proj", "o_proj",  # attention layers
-                    "gate_proj", "up_proj", "down_proj",  # FFN layers
-                ],
-                # LoRA scaling factor: the larger the value, the stronger the LoRA update
-                lora_alpha=16,
-                # dropout rate of the LoRA layers, to prevent overfitting; 0 disables dropout
-                # for small datasets, around 0.1 is recommended
-                lora_dropout=0,
-                # whether to fine-tune bias parameters; "none" leaves biases untouched
-                # none: do not fine-tune biases;
-                # all: fine-tune all of them;
-                # lora_only: fine-tune only the LoRA biases
-                bias="none",
-                # gradient checkpointing to save VRAM, using Unsloth's optimized implementation
-                # slightly slower training, but significantly less VRAM
-                use_gradient_checkpointing="unsloth",
-                # random seed, for reproducibility
-                random_state=3407,
-                # whether to use rank-stabilized LoRA; not used here
-                use_rslora=False,
-                # LoftQ configuration; this quantization technique (for further shrinking the model) is not used here
-                loftq_config=None,
-            )
-
-            tokenizer = get_chat_template(
-                tokenizer,
-                chat_template="qwen-2.5",
-            )
-
             # Load the dataset
             dataset = get_datasets().get(Query().name == dataset_name)
             dataset = [ds["message"][0] for ds in dataset["dataset_items"]]
-            dataset = HFDataset.from_list(dataset)
-            dataset = dataset.map(formatting_prompts_func,
-                                  fn_kwargs={"tokenizer": tokenizer},
-                                  batched=True)
-
-            # Callback class for streaming loss values back to the UI
-            class GradioLossCallback(TrainerCallback):
-                def __init__(self):
-                    self.loss_data = []
-                    self.log_text = ""
-                    self.last_output = {"text": "", "plot": None}
-
-                def on_log(self, args, state, control, logs=None, **kwargs):
-                    print(f"on_log called with logs: {logs}")  # debug output
-                    if "loss" in logs:
-                        print(f"Recording loss: {logs['loss']} at step {state.global_step}")  # debug output
-                        self.loss_data.append({
-                            "step": state.global_step,
-                            "loss": float(logs["loss"])  # make sure it is a float
-                        })
-                        self.log_text += f"Step {state.global_step}: loss={logs['loss']:.4f}\n"
-                        df = pd.DataFrame(self.loss_data)
-                        print(f"DataFrame created: {df}")  # debug output
-                        self.last_output = {
-                            "text": self.log_text,
-                            "plot": df
-                        }
-                    return control
-
-            # Initialize the callback
-            callback = GradioLossCallback()
-
-            # Initialize the SFT trainer
-            trainer = SFTTrainer(
-                model=model,  # model to train
-                tokenizer=tokenizer,  # tokenizer
-                train_dataset=dataset,  # training dataset
-                dataset_text_field="text",  # name of the text field in the dataset
-                max_seq_length=model.max_seq_length,  # maximum sequence length
-                data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
-                dataset_num_proc=1,  # number of processes for dataset preprocessing
-                packing=False,
-                args=TrainingArguments(
-                    per_device_train_batch_size=per_device_train_batch_size,  # per-GPU training batch size
-                    gradient_accumulation_steps=4,  # gradient accumulation steps, to simulate a larger batch size
-                    warmup_steps=int(epoch * 0.1),  # warmup steps, ramping the learning rate up gradually
-                    learning_rate=learning_rate,  # learning rate
-                    lr_scheduler_type="linear",  # linear learning-rate scheduler
-                    max_steps=int(epoch * len(dataset)/per_device_train_batch_size),  # maximum number of training steps (one step = one batch)
-                    fp16=not is_bfloat16_supported(),  # fall back to fp16 if bf16 is unsupported
-                    bf16=is_bfloat16_supported(),  # use bf16 when supported
-                    logging_steps=1,  # log every step
-                    optim="adamw_8bit",  # 8-bit AdamW optimizer to save VRAM, with almost no impact on quality
-                    weight_decay=0.01,  # weight decay for regularization, to prevent overfitting
-                    seed=114514,  # random seed
-                    output_dir=get_workdir() + "/checkpoint/",  # where checkpoints and training logs are saved
-                    save_strategy="steps",  # save intermediate weights by step
-                    save_steps=save_steps,  # use the dynamically passed-in save interval
-                    # report_to="tensorboard",  # report metrics to TensorBoard
-                ),
-            )
-            trainer.add_callback(callback)
-
-            trainer = train_on_responses_only(
-                trainer,
-                instruction_part = "<|im_start|>user\n",
-                response_part = "<|im_start|>assistant\n",
-            )
-
-            # Start training
-            trainer_stats = trainer.train(resume_from_checkpoint=False)
-            return callback.last_output
-
-        def wrapped_train_model(*args):
-            print("Starting training...")  # debug output
-            result = train_model(*args)
-            print(f"Training completed with result: {result}")  # debug output
-            # make sure the return format is correct
-            if result and "text" in result and "plot" in result:
-                return result["text"], result["plot"]
-            return "", pd.DataFrame()  # return defaults
+            train_model(get_model(), get_tokenizer(), dataset, get_workdir(), learning_rate, per_device_train_batch_size, epoch, save_steps, lora_rank)

         train_button.click(
-            fn=wrapped_train_model,
+            fn=start_training,
             inputs=[
                 dataset_dropdown,
                 learning_rate_input,
                 per_device_train_batch_size_input,
                 epoch_input,
                 save_steps_input,
-                lora_rank_input
+                lora_rank_input  # new lora_rank_input
             ],
-            outputs=[output, loss_plot]
+            outputs=output
         )

     return demo
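Note that train_page.py imports the function as `from tools import train_model` while the implementation lives in tools/model.py, so the `tools` package is expected to re-export it. The package's __init__.py is not part of this commit; a minimal sketch of what it presumably contains (an assumption, not shown in the diff):

    # tools/__init__.py (assumed; not included in this diff)
    from .model import train_model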
tools/model.py
@@ -1,5 +1,13 @@
 import os
-def formatting_prompts_func(examples,tokenizer):
+from datasets import Dataset as HFDataset
+from unsloth import FastLanguageModel
+from trl import SFTTrainer  # trainer for supervised fine-tuning
+from transformers import TrainingArguments,DataCollatorForSeq2Seq  # training argument configuration
+from unsloth import is_bfloat16_supported  # check whether bfloat16 training is supported
+from unsloth.chat_templates import get_chat_template, train_on_responses_only
+def get_model_name(model):
+    return os.path.basename(model.name_or_path)
+def formatting_prompts(examples,tokenizer):
     """Format conversation data.
     Args:
         examples: a dict containing a list of conversations
@@ -22,5 +30,91 @@ def formatting_prompts_func(examples,tokenizer):

     return {"text": texts}

-def get_model_name(model):
-    return os.path.basename(model.name_or_path)
+def train_model(model, tokenizer, dataset, output_dir, learning_rate,
+                per_device_train_batch_size, epoch, save_steps, lora_rank):
+    # Model configuration
+    dtype = None  # data type; None means auto-select
+    load_in_4bit = False  # load the model with 4-bit quantization to save VRAM
+
+    model = FastLanguageModel.get_peft_model(
+        # the base model
+        model,
+        # LoRA rank: controls the dimension of the low-rank matrices; larger values mean more
+        # trainable parameters and possibly better quality, but higher training cost
+        # suggested range: 8-32
+        r=lora_rank,  # use the dynamically passed-in LoRA rank
+        # modules to apply LoRA to
+        target_modules=[
+            "q_proj", "k_proj", "v_proj", "o_proj",  # attention layers
+            "gate_proj", "up_proj", "down_proj",  # FFN layers
+        ],
+        # LoRA scaling factor: the larger the value, the stronger the LoRA update
+        lora_alpha=16,
+        # dropout rate of the LoRA layers, to prevent overfitting; 0 disables dropout
+        # for small datasets, around 0.1 is recommended
+        lora_dropout=0,
+        # whether to fine-tune bias parameters; "none" leaves biases untouched
+        # none: do not fine-tune biases;
+        # all: fine-tune all of them;
+        # lora_only: fine-tune only the LoRA biases
+        bias="none",
+        # gradient checkpointing to save VRAM, using Unsloth's optimized implementation
+        # slightly slower training, but significantly less VRAM
+        use_gradient_checkpointing="unsloth",
+        # random seed, for reproducibility
+        random_state=3407,
+        # whether to use rank-stabilized LoRA; not used here
+        use_rslora=False,
+        # LoftQ configuration; this quantization technique (for further shrinking the model) is not used here
+        loftq_config=None,
+    )
+
+    tokenizer = get_chat_template(
+        tokenizer,
+        chat_template="qwen-2.5",
+    )
+
+    dataset = HFDataset.from_list(dataset)
+    dataset = dataset.map(formatting_prompts,
+                          fn_kwargs={"tokenizer": tokenizer},
+                          batched=True)
+
+    # Initialize the SFT trainer
+    trainer = SFTTrainer(
+        model=model,  # model to train
+        tokenizer=tokenizer,  # tokenizer
+        train_dataset=dataset,  # training dataset
+        dataset_text_field="text",  # name of the text field in the dataset
+        max_seq_length=model.max_seq_length,  # maximum sequence length
+        data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
+        dataset_num_proc=1,  # number of processes for dataset preprocessing
+        packing=False,
+        args=TrainingArguments(
+            per_device_train_batch_size=per_device_train_batch_size,  # per-GPU training batch size
+            gradient_accumulation_steps=4,  # gradient accumulation steps, to simulate a larger batch size
+            warmup_steps=int(epoch * 0.1),  # warmup steps, ramping the learning rate up gradually
+            learning_rate=learning_rate,  # learning rate
+            lr_scheduler_type="linear",  # linear learning-rate scheduler
+            max_steps=int(epoch * len(dataset)/per_device_train_batch_size),  # maximum number of training steps (one step = one batch)
+            fp16=not is_bfloat16_supported(),  # fall back to fp16 if bf16 is unsupported
+            bf16=is_bfloat16_supported(),  # use bf16 when supported
+            logging_steps=1,  # log every step
+            optim="adamw_8bit",  # 8-bit AdamW optimizer to save VRAM, with almost no impact on quality
+            weight_decay=0.01,  # weight decay for regularization, to prevent overfitting
+            seed=114514,  # random seed
+            output_dir=output_dir,  # where checkpoints and training logs are saved
+            save_strategy="steps",  # save intermediate weights by step
+            save_steps=save_steps,  # use the dynamically passed-in save interval
+            # report_to="tensorboard",  # report metrics to TensorBoard
+        ),
+    )
+
+    trainer = train_on_responses_only(
+        trainer,
+        instruction_part = "<|im_start|>user\n",
+        response_part = "<|im_start|>assistant\n",
+    )
+
+    # Start training
+    trainer_stats = trainer.train(resume_from_checkpoint=False)
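The training length in the new module is derived from the dataset size rather than from a num_train_epochs setting. A quick worked example of the warmup_steps and max_steps formulas above, with illustrative numbers that are not part of this commit:

    # Illustrative numbers only: 200 training samples, batch size 2, 3 "epochs".
    epoch, per_device_train_batch_size, dataset_len = 3, 2, 200
    warmup_steps = int(epoch * 0.1)                                      # int(0.3) -> 0 for small epoch counts
    max_steps = int(epoch * dataset_len / per_device_train_batch_size)   # int(300.0) -> 300 optimizer steps
    # max_steps counts optimizer updates, and each update consumes
    # gradient_accumulation_steps (= 4) micro-batches of per_device_train_batch_size samples,
    # so this formula yields roughly 4x `epoch` passes over the data rather than exactly `epoch`.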