"""Convert a legacy question/answer JSON file into the project's ``dataset`` schema.

Provenance: introduced as ``tools/convert_json_to_dataset.py`` by commit
84fe78243a54ab9a2c2737ff65f128b1c89565cc (carry <2641257231@qq.com>,
2025-04-09), subject "feat(tools): add a tool script that converts JSON data
to a dataset". Per the commit message, the tool reads data from a JSON file,
converts it to the ``dataset`` format and writes the result to a file, so that
old datasets can be migrated quickly to the new format.

Usage::

    python convert_json_to_dataset.py [INPUT_JSON] [OUTPUT_JSON]

Both arguments are optional and default to the locations the original script
used (``workdir/dataset_old/llamafactory.json`` and ``output.json``).
"""

import argparse
import json
from pathlib import Path
from typing import List, Optional

from schema.dataset import dataset, dataset_item, Q_A

# Built with pathlib instead of a backslash-separated literal so the default
# resolves to the same file on Windows and on POSIX systems.
DEFAULT_INPUT_PATH = Path("workdir") / "dataset_old" / "llamafactory.json"
DEFAULT_OUTPUT_PATH = Path("output.json")


def convert_json_to_dataset(json_data: List[dict]) -> dataset:
    """Build a ``dataset`` from a list of question/answer records.

    Each record becomes one ``dataset_item`` holding a single ``Q_A`` message.
    Item ids are assigned sequentially, starting at 1, in input order.

    Args:
        json_data: Records to convert. Every record must provide the keys
            ``"question"`` and ``"answer"``.

    Returns:
        A ``dataset`` named "Converted Dataset" with ``model_id`` set to
        ``None`` and one item per input record (no items for empty input).

    Raises:
        KeyError: If a record lacks ``"question"`` or ``"answer"``.
    """
    dataset_items = [
        dataset_item(
            id=item_id,
            message=[Q_A(question=item["question"], answer=item["answer"])],
        )
        for item_id, item in enumerate(json_data, start=1)
    ]

    return dataset(
        name="Converted Dataset",
        model_id=None,
        description="Dataset converted from JSON",
        dataset_items=dataset_items,
    )


def main(argv: Optional[List[str]] = None) -> None:
    """Read a JSON file, convert it and write the serialized dataset.

    Args:
        argv: Command-line arguments excluding the program name. ``None``
            makes argparse read ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description="Convert a question/answer JSON file to the dataset format.",
    )
    parser.add_argument(
        "input",
        nargs="?",
        type=Path,
        default=DEFAULT_INPUT_PATH,
        help="source JSON file (default: %(default)s)",
    )
    parser.add_argument(
        "output",
        nargs="?",
        type=Path,
        default=DEFAULT_OUTPUT_PATH,
        help="destination JSON file (default: %(default)s)",
    )
    args = parser.parse_args(argv)

    with args.input.open("r", encoding="utf-8") as file:
        json_data = json.load(file)

    converted_dataset = convert_json_to_dataset(json_data)

    # NOTE(review): model_dump_json is presumably the Pydantic v2 serializer;
    # confirm against the definition of ``dataset`` in schema.dataset.
    args.output.write_text(
        converted_dataset.model_dump_json(indent=4),
        encoding="utf-8",
    )


if __name__ == "__main__":
    main()