Compare commits
2 Commits
1a2ca3e244 ... 9fb31c46c8
Author | SHA1 | Date
---|---|---
 | 9fb31c46c8 |
 | 4f09823123 |
```diff
@@ -1,8 +1,8 @@
 import gradio as gr
 import sys
-import torch
 from tinydb import Query
 from pathlib import Path
+from transformers import TrainerCallback
 
 sys.path.append(str(Path(__file__).resolve().parent.parent))
 from global_var import get_model, get_tokenizer, get_datasets, get_workdir
```
```diff
@@ -45,7 +45,31 @@ def train_page():
         # Load the dataset
         dataset = get_datasets().get(Query().name == dataset_name)
         dataset = [ds["message"][0] for ds in dataset["dataset_items"]]
-        train_model(get_model(), get_tokenizer(), dataset, get_workdir(), learning_rate, per_device_train_batch_size, epoch, save_steps, lora_rank)
+        class LossCallback(TrainerCallback):
+            def __init__(self):
+                self.loss_data = []
+                self.log_text = ""
+                self.last_output = {"text": "", "plot": None}
+
+            def on_log(self, args, state, control, logs=None, **kwargs):
+                if "loss" in logs:
+                    self.loss_data.append({
+                        "step": state.global_step,
+                        "loss": float(logs["loss"])
+                    })
+                    self.log_text += f"Step {state.global_step}: loss={logs['loss']:.4f}\n"
+                    # Add the following two print statements
+                    print(f"Current Step: {state.global_step}")
+                    print(f"Loss Value: {logs['loss']:.4f}")
+                    self.last_output = {
+                        "text": self.log_text,
+                    }
+                # Do not return `control`, to avoid interfering with the training process
+
+        train_model(get_model(), get_tokenizer(),
+                    dataset, get_workdir()+"/checkpoint",
+                    learning_rate, per_device_train_batch_size, epoch,
+                    save_steps, lora_rank, LossCallback)
 
 
     train_button.click(
```
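The new `LossCallback` only reacts to log events that actually carry a training loss (evaluation logs use other keys such as `eval_loss`), which is why `on_log` guards on `"loss" in logs`. A quick way to sanity-check the callback in isolation, outside any Trainer, is sketched below; the `SimpleNamespace` stand-in for the trainer state is illustrative and not part of this change:

```python
# Illustrative only: drive on_log by hand to see what the callback accumulates.
from types import SimpleNamespace

cb = LossCallback()
state = SimpleNamespace(global_step=1)          # minimal stand-in for TrainerState
cb.on_log(args=None, state=state, control=None, logs={"loss": 1.2345})

print(cb.loss_data)     # [{'step': 1, 'loss': 1.2345}]
print(cb.log_text)      # "Step 1: loss=1.2345\n"
print(cb.last_output)   # {'text': 'Step 1: loss=1.2345\n'}
```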
```diff
@@ -31,8 +31,18 @@ def formatting_prompts(examples,tokenizer):
     return {"text": texts}
 
 
-def train_model(model, tokenizer, dataset, output_dir, learning_rate,
-                per_device_train_batch_size, epoch, save_steps, lora_rank):
+def train_model(
+    model,
+    tokenizer,
+    dataset: list,
+    output_dir: str,
+    learning_rate: float,
+    per_device_train_batch_size: int,
+    epoch: int,
+    save_steps: int,
+    lora_rank: int,
+    trainer_callback
+) -> None:
     # Model configuration parameters
     dtype = None  # Data type; None means auto-select
     load_in_4bit = False  # Load the model with 4-bit quantization to save VRAM
```
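For reference, a call using the new signature might look like the sketch below; the hyperparameter values are placeholders, not values taken from this repository:

```python
# Hypothetical values for illustration; only the parameter names come from the diff.
train_model(
    model=get_model(),
    tokenizer=get_tokenizer(),
    dataset=dataset,                        # list of chat messages built in train_page()
    output_dir=get_workdir() + "/checkpoint",
    learning_rate=2e-4,
    per_device_train_batch_size=2,
    epoch=3,
    save_steps=20,
    lora_rank=16,
    trainer_callback=LossCallback,          # the TrainerCallback subclass, not an instance
)
```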
```diff
@@ -75,8 +85,8 @@ def train_model(model, tokenizer, dataset, output_dir, learning_rate,
         chat_template="qwen-2.5",
     )
 
-    dataset = HFDataset.from_list(dataset)
-    dataset = dataset.map(formatting_prompts,
+    train_dataset = HFDataset.from_list(dataset)
+    train_dataset = train_dataset.map(formatting_prompts,
                           fn_kwargs={"tokenizer": tokenizer},
                           batched=True)
 
```
```diff
@@ -84,7 +94,7 @@ def train_model(model, tokenizer, dataset, output_dir, learning_rate,
     trainer = SFTTrainer(
         model=model,  # Model to be trained
         tokenizer=tokenizer,  # Tokenizer
-        train_dataset=dataset,  # Training dataset
+        train_dataset=train_dataset,  # Training dataset
         dataset_text_field="text",  # Name of the text field in the dataset
         max_seq_length=model.max_seq_length,  # Maximum sequence length
         data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
```
```diff
@@ -96,7 +106,7 @@ def train_model(model, tokenizer, dataset, output_dir, learning_rate,
             warmup_steps=int(epoch * 0.1),  # Warmup steps, ramping the learning rate up gradually
             learning_rate=learning_rate,  # Learning rate
             lr_scheduler_type="linear",  # Linear learning-rate scheduler
-            max_steps=int(epoch * len(dataset)/per_device_train_batch_size),  # Maximum number of training steps (one step = one batch of data)
+            max_steps=int(epoch * len(train_dataset)/per_device_train_batch_size),  # Maximum number of training steps (one step = one batch of data)
             fp16=not is_bfloat16_supported(),  # Use fp16 if bf16 is not supported
             bf16=is_bfloat16_supported(),  # Use bf16 if supported
             logging_steps=1,  # Log every step
```
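The `max_steps` expression converts the epoch count into an explicit step budget; note that it ignores gradient accumulation and the number of devices. With assumed figures (not taken from the repository), the arithmetic works out as follows:

```python
# Assumed figures, purely to show the arithmetic behind max_steps.
epoch = 3
n_examples = 100                        # len(train_dataset)
per_device_train_batch_size = 2
max_steps = int(epoch * n_examples / per_device_train_batch_size)  # = 150 optimizer steps
```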
```diff
@@ -106,10 +116,12 @@ def train_model(model, tokenizer, dataset, output_dir, learning_rate,
             output_dir=output_dir,  # Where model checkpoints and training logs are saved
             save_strategy="steps",  # Save intermediate weights every N steps
             save_steps=save_steps,  # Use the save interval passed in dynamically
-            # report_to="tensorboard",  # Report metrics to TensorBoard
+            report_to="none",
         ),
     )
 
+    trainer.add_callback(trainer_callback)
+
     trainer = train_on_responses_only(
         trainer,
         instruction_part = "<|im_start|>user\n",
```
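transformers' `Trainer.add_callback` accepts either a `TrainerCallback` subclass or an instance; this change passes the `LossCallback` class, so the trainer instantiates it internally. If the UI later needs to read the accumulated loss history, one option (an assumption, not something this diff does) is to pass an instance instead and keep a handle to it:

```python
# Sketch: keep a reference to the callback instance so the caller can read its state.
loss_cb = LossCallback()
trainer.add_callback(loss_cb)   # equivalent to add_callback(LossCallback), but we keep loss_cb
# ... after (or during) trainer.train() ...
print(loss_cb.log_text)         # accumulated "Step N: loss=..." lines
```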