Compare commits

...

2 Commits

Author SHA1 Message Date
carry
9fb31c46c8 feat(train): add logging and loss visualization during training
- Add a LossCallback class to record loss data during training
- Pass the callback into the model-training function to enable logging and loss visualization
- Improve training output by printing the current step and loss value
2025-04-14 15:18:14 +08:00
carry
4f09823123 refactor(tools): clean up the train_model function definition
- Add type annotations to improve readability and maintainability
- Define function parameters across multiple lines for cleaner formatting
2025-04-14 14:28:36 +08:00
2 changed files with 45 additions and 9 deletions
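The loss visualization mentioned in the first commit is driven by the loss_data that LossCallback (see the first file below) accumulates; the callback itself only records numbers and a text log. A helper along the following lines could turn that history into a figure for the Gradio page. This is a hedged sketch, not part of the diff: the make_loss_plot name and the use of matplotlib with a gr.Plot component are assumptions.

# Hypothetical helper, not in this diff: convert LossCallback.loss_data
# into a matplotlib figure that a Gradio gr.Plot component can display.
import matplotlib.pyplot as plt

def make_loss_plot(loss_data: list) -> plt.Figure:
    steps = [point["step"] for point in loss_data]
    losses = [point["loss"] for point in loss_data]
    fig, ax = plt.subplots()
    ax.plot(steps, losses, marker="o")
    ax.set_xlabel("step")
    ax.set_ylabel("loss")
    ax.set_title("Training loss")
    return fig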

View File

@@ -1,8 +1,8 @@
 import gradio as gr
 import sys
-import torch
 from tinydb import Query
 from pathlib import Path
+from transformers import TrainerCallback
 sys.path.append(str(Path(__file__).resolve().parent.parent))
 from global_var import get_model, get_tokenizer, get_datasets, get_workdir
@@ -45,7 +45,31 @@ def train_page():
 # Load the dataset
 dataset = get_datasets().get(Query().name == dataset_name)
 dataset = [ds["message"][0] for ds in dataset["dataset_items"]]
-train_model(get_model(), get_tokenizer(), dataset, get_workdir(), learning_rate, per_device_train_batch_size, epoch, save_steps, lora_rank)
+class LossCallback(TrainerCallback):
+    def __init__(self):
+        self.loss_data = []
+        self.log_text = ""
+        self.last_output = {"text": "", "plot": None}
+
+    def on_log(self, args, state, control, logs=None, **kwargs):
+        if "loss" in logs:
+            self.loss_data.append({
+                "step": state.global_step,
+                "loss": float(logs["loss"])
+            })
+            self.log_text += f"Step {state.global_step}: loss={logs['loss']:.4f}\n"
+            # Add the following two print statements
+            print(f"Current Step: {state.global_step}")
+            print(f"Loss Value: {logs['loss']:.4f}")
+            self.last_output = {
+                "text": self.log_text,
+            }
+        # Do not return control, to avoid interfering with the training process
+
+train_model(get_model(), get_tokenizer(),
+            dataset, get_workdir()+"/checkpoint",
+            learning_rate, per_device_train_batch_size, epoch,
+            save_steps, lora_rank, LossCallback)
 train_button.click(
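A detail worth noting in the call above: train_model is handed the LossCallback class itself rather than an instance, and the second file (below) simply forwards it to trainer.add_callback, which transformers accepts in either form (a TrainerCallback subclass or an instance). Passing the class means the page keeps no reference to the callback object, so loss_data and log_text cannot be read back after training. A minimal alternative sketch, assuming the surrounding call stays the same:

# Hypothetical variant, not in this diff: instantiate the callback first
# so the UI code can read the recorded loss history back afterwards.
loss_callback = LossCallback()
train_model(get_model(), get_tokenizer(),
            dataset, get_workdir() + "/checkpoint",
            learning_rate, per_device_train_batch_size, epoch,
            save_steps, lora_rank, loss_callback)
print(loss_callback.log_text)  # accumulated "Step N: loss=..." lines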

View File

@@ -31,8 +31,18 @@ def formatting_prompts(examples,tokenizer):
     return {"text": texts}
 
-def train_model(model, tokenizer, dataset, output_dir, learning_rate,
-                per_device_train_batch_size, epoch, save_steps, lora_rank):
+def train_model(
+    model,
+    tokenizer,
+    dataset: list,
+    output_dir: str,
+    learning_rate: float,
+    per_device_train_batch_size: int,
+    epoch: int,
+    save_steps: int,
+    lora_rank: int,
+    trainer_callback
+) -> None:
     # Model configuration parameters
     dtype = None  # Data type; None means auto-select
     load_in_4bit = False  # Load the model with 4-bit quantization to save VRAM
@@ -75,8 +85,8 @@ def train_model(model, tokenizer, dataset, output_dir, learning_rate,
        chat_template="qwen-2.5",
     )
 
-    dataset = HFDataset.from_list(dataset)
-    dataset = dataset.map(formatting_prompts,
+    train_dataset = HFDataset.from_list(dataset)
+    train_dataset = train_dataset.map(formatting_prompts,
         fn_kwargs={"tokenizer": tokenizer},
         batched=True)
@@ -84,7 +94,7 @@ def train_model(model, tokenizer, dataset, output_dir, learning_rate,
     trainer = SFTTrainer(
         model=model,  # The model to train
         tokenizer=tokenizer,  # Tokenizer
-        train_dataset=dataset,  # Training dataset
+        train_dataset=train_dataset,  # Training dataset
         dataset_text_field="text",  # Name of the text field in the dataset
         max_seq_length=model.max_seq_length,  # Maximum sequence length
         data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
@@ -96,7 +106,7 @@ def train_model(model, tokenizer, dataset, output_dir, learning_rate,
             warmup_steps=int(epoch * 0.1),  # Warmup steps, ramping the learning rate up gradually
             learning_rate=learning_rate,  # Learning rate
             lr_scheduler_type="linear",  # Linear learning-rate scheduler
-            max_steps=int(epoch * len(dataset)/per_device_train_batch_size),  # Maximum number of training steps (one step = one batch of data)
+            max_steps=int(epoch * len(train_dataset)/per_device_train_batch_size),  # Maximum number of training steps (one step = one batch of data)
             fp16=not is_bfloat16_supported(),  # Fall back to fp16 if bf16 is not supported
             bf16=is_bfloat16_supported(),  # Use bf16 if it is supported
             logging_steps=1,  # Log every step
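For context on the max_steps expression just above: it converts the requested number of epochs into optimizer steps by dividing the dataset size by the per-device batch size. A quick worked example with assumed values (not taken from this repository):

# Assumed example values, for illustration only.
epoch = 3
dataset_size = 1000                      # stands in for len(train_dataset)
per_device_train_batch_size = 4
max_steps = int(epoch * dataset_size / per_device_train_batch_size)  # 750
warmup_steps = int(epoch * 0.1)          # 0 here, since int() truncates 0.3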
@@ -106,10 +116,12 @@ def train_model(model, tokenizer, dataset, output_dir, learning_rate,
             output_dir=output_dir,  # Where to save model checkpoints and training logs
             save_strategy="steps",  # Save intermediate weights by step
             save_steps=save_steps,  # Use the dynamically provided save interval
-            # report_to="tensorboard",  # Send training info to TensorBoard
+            report_to="none",
         ),
     )
 
+    trainer.add_callback(trainer_callback)
+
     trainer = train_on_responses_only(
         trainer,
         instruction_part = "<|im_start|>user\n",