gzhu-biyesheji/tools/convert_json_to_dataset.py
carry 84fe78243a feat(tools): 添加 JSON 数据转换为 dataset 的工具脚本
- 新增 convert_json_to_dataset 函数,用于将 JSON 数据转换为 dataset 对象
- 实现了从 JSON 文件读取数据、转换为 dataset 格式并输出到文件的功能
- 该工具可帮助用户将旧数据集快速转换为新的 dataset 格式
2025-04-09 17:31:53 +08:00

35 lines
1.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from typing import List
from schema.dataset import dataset, dataset_item, Q_A
import json
def convert_json_to_dataset(json_data: List[dict]) -> dataset:
# 将JSON数据转换为dataset格式
dataset_items = []
item_id = 1 # 自增ID计数器
for item in json_data:
qa = Q_A(question=item["question"], answer=item["answer"])
dataset_item_obj = dataset_item(id=item_id, message=[qa])
dataset_items.append(dataset_item_obj)
item_id += 1 # ID自增
# 创建dataset对象
result_dataset = dataset(
name="Converted Dataset",
model_id=None,
description="Dataset converted from JSON",
dataset_items=dataset_items
)
return result_dataset
# 示例从文件读取JSON并转换
if __name__ == "__main__":
# 假设JSON数据存储在文件中
with open(r"workdir\dataset_old\llamafactory.json", "r", encoding="utf-8") as file:
json_data = json.load(file)
# 转换为dataset格式
converted_dataset = convert_json_to_dataset(json_data)
# 输出结果到文件
with open("output.json", "w", encoding="utf-8") as file:
file.write(converted_dataset.model_dump_json(indent=4))