Compare commits

No commits in common. "9fb31c46c8906516789a57788465828c309c3c49" and "1a2ca3e2440d5c21da472f369656246ee7654985" have entirely different histories.

2 changed files with 9 additions and 45 deletions

View File

@@ -1,8 +1,8 @@
 import gradio as gr
 import sys
-import torch
 from tinydb import Query
 from pathlib import Path
-from transformers import TrainerCallback
 sys.path.append(str(Path(__file__).resolve().parent.parent))
 from global_var import get_model, get_tokenizer, get_datasets, get_workdir
@@ -45,31 +45,7 @@ def train_page():
         # Load the dataset
         dataset = get_datasets().get(Query().name == dataset_name)
         dataset = [ds["message"][0] for ds in dataset["dataset_items"]]
-        class LossCallback(TrainerCallback):
-            def __init__(self):
-                self.loss_data = []
-                self.log_text = ""
-                self.last_output = {"text": "", "plot": None}
-            def on_log(self, args, state, control, logs=None, **kwargs):
-                if "loss" in logs:
-                    self.loss_data.append({
-                        "step": state.global_step,
-                        "loss": float(logs["loss"])
-                    })
-                    self.log_text += f"Step {state.global_step}: loss={logs['loss']:.4f}\n"
-                    # Added the two print statements below
-                    print(f"Current Step: {state.global_step}")
-                    print(f"Loss Value: {logs['loss']:.4f}")
-                    self.last_output = {
-                        "text": self.log_text,
-                    }
-                # Do not return `control`, to avoid interfering with the training process
-        train_model(get_model(), get_tokenizer(),
-                    dataset, get_workdir()+"/checkpoint",
-                    learning_rate, per_device_train_batch_size, epoch,
-                    save_steps, lora_rank, LossCallback)
+        train_model(get_model(), get_tokenizer(), dataset, get_workdir(), learning_rate, per_device_train_batch_size, epoch, save_steps, lora_rank)
     train_button.click(
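A side note on the deleted callback: `TrainerCallback.on_log` declares `logs=None`, so the bare membership test `"loss" in logs` raises `TypeError` if a caller ever omits the logs dict. If the callback is restored later, a minimal defensive sketch of the same idea (names mirror the deleted code; this block is not part of either commit) would be:

```python
from transformers import TrainerCallback

class LossCallback(TrainerCallback):
    """Collects per-step loss values and a human-readable log string."""

    def __init__(self):
        self.loss_data = []  # one {"step", "loss"} record per logged step
        self.log_text = ""

    def on_log(self, args, state, control, logs=None, **kwargs):
        # Guard against logs=None before testing membership
        if logs and "loss" in logs:
            self.loss_data.append({"step": state.global_step,
                                   "loss": float(logs["loss"])})
            self.log_text += f"Step {state.global_step}: loss={logs['loss']:.4f}\n"
        # Returning nothing leaves `control` untouched, so training proceeds normally
```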

View File

@@ -31,18 +31,8 @@ def formatting_prompts(examples,tokenizer):
     return {"text": texts}
 
-def train_model(
-    model,
-    tokenizer,
-    dataset: list,
-    output_dir: str,
-    learning_rate: float,
-    per_device_train_batch_size: int,
-    epoch: int,
-    save_steps: int,
-    lora_rank: int,
-    trainer_callback
-) -> None:
+def train_model(model, tokenizer, dataset, output_dir, learning_rate,
+                per_device_train_batch_size, epoch, save_steps, lora_rank):
     # Model configuration parameters
     dtype = None  # Data type; None means auto-detect
     load_in_4bit = False  # Load the model with 4-bit quantization to save VRAM
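Neither commit shows where `dtype` and `load_in_4bit` are consumed; the hunks cut off before that point. They do match the signature of unsloth's model loader, so a typical (assumed, not confirmed by this diff) pairing would look like:

```python
# Assumed usage; the loading call itself is outside the hunks shown here.
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="Qwen/Qwen2.5-7B-Instruct",  # hypothetical checkpoint
    max_seq_length=2048,                    # hypothetical value
    dtype=None,          # None lets unsloth auto-detect (bf16 where supported)
    load_in_4bit=False,  # set True to trade accuracy for VRAM
)
```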
@@ -85,8 +75,8 @@ def train_model(
         chat_template="qwen-2.5",
     )
 
-    train_dataset = HFDataset.from_list(dataset)
-    train_dataset = train_dataset.map(formatting_prompts,
+    dataset = HFDataset.from_list(dataset)
+    dataset = dataset.map(formatting_prompts,
                           fn_kwargs={"tokenizer": tokenizer},
                           batched=True)
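Only the signature and return value of `formatting_prompts` are visible in this compare view. Given the qwen-2.5 chat template applied just above and the batched `.map` call, a plausible reconstruction would be the following (the `conversations` column name is a guess; the real body is outside the hunks shown):

```python
# Hypothetical reconstruction of the helper referenced above.
def formatting_prompts(examples, tokenizer):
    # `examples` is a batch (a dict of columns); assume a "conversations"
    # column holding chat-format message lists.
    texts = [
        tokenizer.apply_chat_template(convo, tokenize=False,
                                      add_generation_prompt=False)
        for convo in examples["conversations"]
    ]
    return {"text": texts}  # matches dataset_text_field="text" below
```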
@@ -94,7 +84,7 @@ def train_model(
     trainer = SFTTrainer(
         model=model,  # Model to train
         tokenizer=tokenizer,  # Tokenizer
-        train_dataset=train_dataset,  # Training dataset
+        train_dataset=dataset,  # Training dataset
         dataset_text_field="text",  # Name of the text field in the dataset
         max_seq_length=model.max_seq_length,  # Maximum sequence length
         data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
@@ -106,7 +96,7 @@ def train_model(
         warmup_steps=int(epoch * 0.1),  # Warmup steps: ramp the learning rate up gradually
         learning_rate=learning_rate,  # Learning rate
         lr_scheduler_type="linear",  # Linear learning-rate schedule
-        max_steps=int(epoch * len(train_dataset)/per_device_train_batch_size),  # Max training steps (one step = one batch)
+        max_steps=int(epoch * len(dataset)/per_device_train_batch_size),  # Max training steps (one step = one batch)
         fp16=not is_bfloat16_supported(),  # Fall back to fp16 when bf16 is unsupported
         bf16=is_bfloat16_supported(),  # Use bf16 when supported
         logging_steps=1,  # Log every step
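Two lines shared by both commits deserve a second look: `max_steps` correctly scales with dataset size, but `warmup_steps=int(epoch * 0.1)` scales with the epoch *count* rather than with steps, so it truncates to zero for any epoch value below 10. A worked example with hypothetical numbers:

```python
# Hypothetical values, for illustration only
epoch = 3
dataset_len = 1000            # len(dataset) after formatting
per_device_train_batch_size = 4

max_steps = int(epoch * dataset_len / per_device_train_batch_size)  # 750 steps
warmup_steps = int(epoch * 0.1)  # int(0.3) == 0 -> effectively no warmup
```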
@@ -116,12 +106,10 @@ def train_model(
             output_dir=output_dir,  # Where checkpoints and training logs are saved
             save_strategy="steps",  # Save intermediate weights every save_steps steps
             save_steps=save_steps,  # Save interval, passed in dynamically
-            report_to="none",
+            report_to="tensorboard",  # Send metrics to TensorBoard
         ),
     )
-    trainer.add_callback(trainer_callback)
     trainer = train_on_responses_only(
         trainer,
         instruction_part = "<|im_start|>user\n",
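The compare view is truncated inside this last hunk. For context, unsloth's `train_on_responses_only` normally also takes a `response_part` so that loss is computed only on assistant tokens; a typical call with the qwen-2.5 turn markers (assumed, since the remaining arguments are not shown in this diff) looks like:

```python
# Assumed continuation; only instruction_part is visible in the hunk above.
from unsloth.chat_templates import train_on_responses_only

trainer = train_on_responses_only(
    trainer,
    instruction_part="<|im_start|>user\n",      # user turns are masked out of the loss
    response_part="<|im_start|>assistant\n",    # assistant turns are trained on
)
```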