feat(train_page): 启动 TensorBoard 进程并确保训练结束后终止
- 在训练页面中添加 TensorBoard 进程启动代码
- 创建日志目录并启动 TensorBoard 子进程
- 在训练结束后终止 TensorBoard 进程
This commit is contained in:
parent
4f7926aec6
commit
9298438f98
@ -1,3 +1,4 @@
|
|||||||
|
import subprocess
|
||||||
import os
|
import os
|
||||||
import gradio as gr
|
import gradio as gr
|
||||||
import sys
|
import sys
|
||||||
@ -55,10 +56,25 @@ def train_page():
|
|||||||
next_dir_number = max([int(d) for d in existing_dirs], default=0) + 1
|
next_dir_number = max([int(d) for d in existing_dirs], default=0) + 1
|
||||||
new_training_dir = os.path.join(training_dir, str(next_dir_number))
|
new_training_dir = os.path.join(training_dir, str(next_dir_number))
|
||||||
|
|
||||||
train_model(get_model(), get_tokenizer(),
|
# 启动 TensorBoard 子进程
|
||||||
dataset, new_training_dir,
|
tensorboard_logdir = os.path.join(new_training_dir, "logs")
|
||||||
learning_rate, per_device_train_batch_size, epoch,
|
os.makedirs(tensorboard_logdir, exist_ok=True) # 确保日志目录存在
|
||||||
save_steps, lora_rank)
|
tensorboard_process = subprocess.Popen(
|
||||||
|
["tensorboard", "--logdir", tensorboard_logdir, "--port", "6006"],
|
||||||
|
stdout=subprocess.PIPE,
|
||||||
|
stderr=subprocess.PIPE
|
||||||
|
)
|
||||||
|
print("TensorBoard 已启动,日志目录:", tensorboard_logdir)
|
||||||
|
|
||||||
|
try:
|
||||||
|
train_model(get_model(), get_tokenizer(),
|
||||||
|
dataset, new_training_dir,
|
||||||
|
learning_rate, per_device_train_batch_size, epoch,
|
||||||
|
save_steps, lora_rank)
|
||||||
|
finally:
|
||||||
|
# 确保训练结束后终止 TensorBoard 子进程
|
||||||
|
tensorboard_process.terminate()
|
||||||
|
print("TensorBoard 子进程已终止")
|
||||||
|
|
||||||
train_button.click(
|
train_button.click(
|
||||||
fn=start_training,
|
fn=start_training,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user