feat(tools): 添加 JSON 数据转换为 dataset 的工具脚本
- 新增 convert_json_to_dataset 函数,用于将 JSON 数据转换为 dataset 对象
- 实现了从 JSON 文件读取数据、转换为 dataset 格式并输出到文件的功能
- 该工具可帮助用户将旧数据集快速转换为新的 dataset 格式
This commit is contained in:
parent
4d8754aad2
commit
84fe78243a
35
tools/convert_json_to_dataset.py
Normal file
35
tools/convert_json_to_dataset.py
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
from typing import List
|
||||||
|
from schema.dataset import dataset, dataset_item, Q_A
|
||||||
|
import json
|
||||||
|
|
||||||
|
def convert_json_to_dataset(json_data: List[dict]) -> dataset:
    """Convert a list of question/answer records into a ``dataset`` object.

    Each record becomes one ``dataset_item`` holding a single ``Q_A`` message.
    Items receive sequential IDs starting at 1, in input order.

    Args:
        json_data: Records parsed from JSON; each record must provide the
            keys ``"question"`` and ``"answer"``.

    Returns:
        A ``dataset`` named "Converted Dataset" with ``model_id=None`` that
        contains one item per input record.

    Raises:
        KeyError: If a record lacks the ``"question"`` or ``"answer"`` key.
    """
    # enumerate(start=1) supplies the auto-incrementing item ID.
    dataset_items = [
        dataset_item(
            id=item_id,
            message=[Q_A(question=item["question"], answer=item["answer"])],
        )
        for item_id, item in enumerate(json_data, start=1)
    ]

    return dataset(
        name="Converted Dataset",
        model_id=None,
        description="Dataset converted from JSON",
        dataset_items=dataset_items,
    )
|
||||||
|
|
||||||
|
# Example: read JSON from a file and convert it.
if __name__ == "__main__":
    from pathlib import Path

    # Build the input path from components so it resolves on POSIX as well as
    # on Windows; a backslash-separated literal is only a valid path on Windows.
    source_path = Path("workdir") / "dataset_old" / "llamafactory.json"
    with open(source_path, "r", encoding="utf-8") as file:
        json_data = json.load(file)

    # Convert to the dataset format.
    converted_dataset = convert_json_to_dataset(json_data)

    # Write the result to a file.
    with open("output.json", "w", encoding="utf-8") as file:
        file.write(converted_dataset.model_dump_json(indent=4))
|
Loading…
x
Reference in New Issue
Block a user