diff --git a/frontend/train_page.py b/frontend/train_page.py index 56d0ea7..c2e1d7c 100644 --- a/frontend/train_page.py +++ b/frontend/train_page.py @@ -1,3 +1,4 @@ +import subprocess import os import gradio as gr import sys @@ -55,10 +56,25 @@ def train_page(): next_dir_number = max([int(d) for d in existing_dirs], default=0) + 1 new_training_dir = os.path.join(training_dir, str(next_dir_number)) - train_model(get_model(), get_tokenizer(), - dataset, new_training_dir, - learning_rate, per_device_train_batch_size, epoch, - save_steps, lora_rank) + # 启动 TensorBoard 子进程 + tensorboard_logdir = os.path.join(new_training_dir, "logs") + os.makedirs(tensorboard_logdir, exist_ok=True) # 确保日志目录存在 + tensorboard_process = subprocess.Popen( + ["tensorboard", "--logdir", tensorboard_logdir, "--port", "6006"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE + ) + print("TensorBoard 已启动,日志目录:", tensorboard_logdir) + + try: + train_model(get_model(), get_tokenizer(), + dataset, new_training_dir, + learning_rate, per_device_train_batch_size, epoch, + save_steps, lora_rank) + finally: + # 确保训练结束后终止 TensorBoard 子进程 + tensorboard_process.terminate() + print("TensorBoard 子进程已终止") train_button.click( fn=start_training,