"""Convert a JSON list of question/answer records into a ``dataset`` object.

This module was added as ``tools/convert_json_to_dataset.py``. It can be
imported for ``convert_json_to_dataset`` or run as a script.
"""
import json
from pathlib import Path
from typing import List

from schema.dataset import dataset, dataset_item, Q_A


def convert_json_to_dataset(json_data: List[dict]) -> dataset:
    """Convert decoded JSON records into a ``dataset``.

    Each record becomes one ``dataset_item`` whose ``message`` list holds a
    single ``Q_A`` built from the record's ``"question"`` and ``"answer"``
    values. Items receive sequential ids starting at 1, in input order.

    Args:
        json_data: One dict per question/answer pair.

    Returns:
        A ``dataset`` named "Converted Dataset" with ``model_id=None`` and one
        item per input record (no items for an empty input).

    Raises:
        KeyError: If a record dict lacks the ``"question"`` or ``"answer"`` key.
    """
    # enumerate(start=1) supplies the auto-incrementing id directly.
    dataset_items = [
        dataset_item(
            id=item_id,
            message=[Q_A(question=item["question"], answer=item["answer"])],
        )
        for item_id, item in enumerate(json_data, start=1)
    ]

    return dataset(
        name="Converted Dataset",
        model_id=None,
        description="Dataset converted from JSON",
        dataset_items=dataset_items,
    )


def main() -> None:
    """Read the source JSON file, convert it, and write ``output.json``."""
    # Build the path from components so it resolves on POSIX as well as on
    # Windows; a literal backslash-separated string is one file name on POSIX.
    source_path = Path("workdir") / "dataset_old" / "llamafactory.json"
    with source_path.open("r", encoding="utf-8") as source_file:
        records = json.load(source_file)

    converted_dataset = convert_json_to_dataset(records)

    # NOTE(review): model_dump_json looks like the Pydantic v2 serializer;
    # confirm against schema.dataset.
    with open("output.json", "w", encoding="utf-8") as output_file:
        output_file.write(converted_dataset.model_dump_json(indent=4))


# Example usage: read JSON from a file and convert it.
if __name__ == "__main__":
    main()