feat(train_page): 启动 TensorBoard 进程并确保训练结束后终止
- 在训练页面中添加 TensorBoard 进程启动代码
- 创建日志目录并启动 TensorBoard 子进程
- 在训练结束后终止 TensorBoard 进程
This commit is contained in:
parent
4f7926aec6
commit
9298438f98
@@ -1,3 +1,4 @@
|
||||
import subprocess
|
||||
import os
|
||||
import gradio as gr
|
||||
import sys
|
||||
@@ -55,10 +56,25 @@ def train_page():
|
||||
next_dir_number = max([int(d) for d in existing_dirs], default=0) + 1
|
||||
new_training_dir = os.path.join(training_dir, str(next_dir_number))
|
||||
|
||||
# 启动 TensorBoard 子进程
|
||||
tensorboard_logdir = os.path.join(new_training_dir, "logs")
|
||||
os.makedirs(tensorboard_logdir, exist_ok=True) # 确保日志目录存在
|
||||
tensorboard_process = subprocess.Popen(
|
||||
["tensorboard", "--logdir", tensorboard_logdir, "--port", "6006"],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE
|
||||
)
|
||||
print("TensorBoard 已启动,日志目录:", tensorboard_logdir)
|
||||
|
||||
try:
|
||||
train_model(get_model(), get_tokenizer(),
|
||||
dataset, new_training_dir,
|
||||
learning_rate, per_device_train_batch_size, epoch,
|
||||
save_steps, lora_rank)
|
||||
finally:
|
||||
# 确保训练结束后终止 TensorBoard 子进程
|
||||
tensorboard_process.terminate()
|
||||
print("TensorBoard 子进程已终止")
|
||||
|
||||
train_button.click(
|
||||
fn=start_training,
|
||||
|
Loading…
x
Reference in New Issue
Block a user