feat(train_page): 启动 TensorBoard 进程并确保训练结束后终止

- 在训练页面中添加 TensorBoard 进程启动代码
- 创建日志目录并启动 TensorBoard 子进程
- 在训练结束后终止 TensorBoard 进程
This commit is contained in:
carry 2025-04-14 17:00:33 +08:00
parent 4f7926aec6
commit 9298438f98

View File

@ -1,3 +1,4 @@
import subprocess
import os
import gradio as gr
import sys
@ -55,10 +56,25 @@ def train_page():
next_dir_number = max([int(d) for d in existing_dirs], default=0) + 1
new_training_dir = os.path.join(training_dir, str(next_dir_number))
# 启动 TensorBoard 子进程
tensorboard_logdir = os.path.join(new_training_dir, "logs")
os.makedirs(tensorboard_logdir, exist_ok=True) # 确保日志目录存在
tensorboard_process = subprocess.Popen(
["tensorboard", "--logdir", tensorboard_logdir, "--port", "6006"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE
)
print("TensorBoard 已启动,日志目录:", tensorboard_logdir)
try:
train_model(get_model(), get_tokenizer(),
dataset, new_training_dir,
learning_rate, per_device_train_batch_size, epoch,
save_steps, lora_rank)
finally:
# 确保训练结束后终止 TensorBoard 子进程
tensorboard_process.terminate()
print("TensorBoard 子进程已终止")
train_button.click(
fn=start_training,