
- 将 dataset、dataset_item 和 doc 类的首字母大写,以符合 Python 类命名惯例 - 更新相关模块中的导入和引用,以适应新的类名 - 此更改不影响功能,仅提高了代码的一致性和可读性
35 lines
1.2 KiB
Python
35 lines
1.2 KiB
Python
from typing import List
|
||
from schema.dataset import Dataset, DatasetItem, Q_A
|
||
import json
|
||
|
||
def convert_json_to_dataset(json_data: List[dict]) -> Dataset:
    """Build a ``Dataset`` from a list of question/answer records.

    Each record is read through its ``"question"`` and ``"answer"`` keys and
    wrapped as a single ``Q_A`` inside its own ``DatasetItem``. Items are
    numbered sequentially from 1, following the order of ``json_data``.

    Args:
        json_data: Records to convert; each one is indexed with the
            ``"question"`` and ``"answer"`` keys.

    Returns:
        A ``Dataset`` named "Converted Dataset", with ``model_id`` set to
        ``None`` and one ``DatasetItem`` per input record.

    Raises:
        KeyError: If a record dict has no ``"question"`` or ``"answer"`` key.
    """
    # enumerate(start=1) supplies the auto-incrementing item id.
    entries = [
        DatasetItem(
            id=position,
            message=[Q_A(question=record["question"], answer=record["answer"])],
        )
        for position, record in enumerate(json_data, start=1)
    ]

    return Dataset(
        name="Converted Dataset",
        model_id=None,
        description="Dataset converted from JSON",
        dataset_items=entries,
    )
|
||
|
||
# Example: when run as a script, read a JSON file, convert it and save the result.
if __name__ == "__main__":
    from pathlib import Path

    # Build the input path from its components instead of a backslash-separated
    # literal: backslashes are path separators only on Windows, so the literal
    # would name a single (non-existent) file on POSIX systems.
    source_path = Path("workdir") / "dataset_old" / "llamafactory.json"
    with open(source_path, "r", encoding="utf-8") as file:
        json_data = json.load(file)

    # Convert the loaded records into the Dataset structure.
    converted_dataset = convert_json_to_dataset(json_data)

    # Write the serialized dataset to output.json in the current directory.
    with open("output.json", "w", encoding="utf-8") as file:
        file.write(converted_dataset.model_dump_json(indent=4))