gzhu-biyesheji/tools/convert_json_to_dataset.py
carry 4c9caff668 refactor(schema): 重构数据集和文档类的命名
- 将 dataset、dataset_item 和 doc 类的首字母大写,以符合 Python 类命名惯例
- 更新相关模块中的导入和引用,以适应新的类名
- 此更改不影响功能,仅提高了代码的一致性和可读性
2025-04-20 01:46:15 +08:00

35 lines
1.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from typing import List
from schema.dataset import Dataset, DatasetItem, Q_A
import json
def convert_json_to_dataset(json_data: List[dict]) -> Dataset:
# 将JSON数据转换为dataset格式
dataset_items = []
item_id = 1 # 自增ID计数器
for item in json_data:
qa = Q_A(question=item["question"], answer=item["answer"])
dataset_item_obj = DatasetItem(id=item_id, message=[qa])
dataset_items.append(dataset_item_obj)
item_id += 1 # ID自增
# 创建dataset对象
result_dataset = Dataset(
name="Converted Dataset",
model_id=None,
description="Dataset converted from JSON",
dataset_items=dataset_items
)
return result_dataset
# 示例从文件读取JSON并转换
if __name__ == "__main__":
# 假设JSON数据存储在文件中
with open(r"workdir\dataset_old\llamafactory.json", "r", encoding="utf-8") as file:
json_data = json.load(file)
# 转换为dataset格式
converted_dataset = convert_json_to_dataset(json_data)
# 输出结果到文件
with open("output.json", "w", encoding="utf-8") as file:
file.write(converted_dataset.model_dump_json(indent=4))