From 9298438f98599a9db6842ea0bed7e100b5de2748 Mon Sep 17 00:00:00 2001 From: carry Date: Mon, 14 Apr 2025 17:00:33 +0800 Subject: [PATCH] =?UTF-8?q?feat(train=5Fpage):=20=E5=90=AF=E5=8A=A8=20Tens?= =?UTF-8?q?orBoard=20=E8=BF=9B=E7=A8=8B=E5=B9=B6=E7=A1=AE=E4=BF=9D?= =?UTF-8?q?=E8=AE=AD=E7=BB=83=E7=BB=93=E6=9D=9F=E5=90=8E=E7=BB=88=E6=AD=A2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 在训练页面中添加 TensorBoard 进程启动代码 - 创建日志目录并启动 TensorBoard 子进程 - 在训练结束后终止 TensorBoard 进程 --- frontend/train_page.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/frontend/train_page.py b/frontend/train_page.py index 56d0ea7..c2e1d7c 100644 --- a/frontend/train_page.py +++ b/frontend/train_page.py @@ -1,3 +1,4 @@ +import subprocess import os import gradio as gr import sys @@ -55,10 +56,25 @@ def train_page(): next_dir_number = max([int(d) for d in existing_dirs], default=0) + 1 new_training_dir = os.path.join(training_dir, str(next_dir_number)) - train_model(get_model(), get_tokenizer(), - dataset, new_training_dir, - learning_rate, per_device_train_batch_size, epoch, - save_steps, lora_rank) + # 启动 TensorBoard 子进程 + tensorboard_logdir = os.path.join(new_training_dir, "logs") + os.makedirs(tensorboard_logdir, exist_ok=True) # 确保日志目录存在 + tensorboard_process = subprocess.Popen( + ["tensorboard", "--logdir", tensorboard_logdir, "--port", "6006"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE + ) + print("TensorBoard 已启动,日志目录:", tensorboard_logdir) + + try: + train_model(get_model(), get_tokenizer(), + dataset, new_training_dir, + learning_rate, per_device_train_batch_size, epoch, + save_steps, lora_rank) + finally: + # 确保训练结束后终止 TensorBoard 子进程 + tensorboard_process.terminate() + print("TensorBoard 子进程已终止") train_button.click( fn=start_training,