
- 将 dataset、dataset_item 和 doc 类的首字母大写,以符合 Python 类命名惯例 - 更新相关模块中的导入和引用,以适应新的类名 - 此更改不影响功能,仅提高了代码的一致性和可读性
50 lines
1.6 KiB
Python
50 lines
1.6 KiB
Python
import os
|
|
import sys
|
|
import json
|
|
from pathlib import Path
|
|
from typing import List
|
|
from tinydb import TinyDB, Query
|
|
from tinydb.storages import MemoryStorage
|
|
|
|
# 将项目根目录添加到系统路径中,以便能够导入项目中的其他模块
|
|
sys.path.append(str(Path(__file__).resolve().parent.parent))
|
|
from schema.dataset import Dataset, DatasetItem, Q_A
|
|
|
|
def get_all_dataset(workdir: str) -> TinyDB:
    """Load every JSON file in ``<workdir>/dataset`` into an in-memory TinyDB.

    Each ``*.json`` file directly inside the dataset directory is parsed and
    its top-level value is inserted as one document. Files that cannot be
    read, parsed, or inserted are reported on stdout and skipped, so a single
    bad file never aborts the scan.

    Args:
        workdir: Path of the working directory that contains a ``dataset``
            sub-directory.

    Returns:
        A TinyDB backed by ``MemoryStorage`` holding one document per
        successfully loaded file. It is empty when the dataset directory is
        missing or is not a directory.
    """
    db = TinyDB(storage=MemoryStorage)
    dataset_dir = os.path.join(workdir, "dataset")

    # isdir (not exists): a regular file named "dataset" would otherwise
    # reach os.listdir and raise NotADirectoryError outside the try block.
    if not os.path.isdir(dataset_dir):
        return db

    # os.listdir returns entries in arbitrary order; sorting makes the
    # insertion order reproducible across runs and platforms.
    for filename in sorted(os.listdir(dataset_dir)):
        if not filename.endswith(".json"):
            continue

        filepath = os.path.join(dataset_dir, filename)
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                data = json.load(f)
            db.insert(data)
        except Exception as e:
            # Deliberately broad: loading is best-effort, so I/O errors,
            # malformed JSON, and documents rejected by the database are all
            # reported and skipped instead of propagating.
            print(f"Error loading dataset file {filename}: {str(e)}")

    return db
|
|
|
|
|
|
if __name__ == "__main__":
    # Locate the "workdir" folder one level above this file's directory.
    script_dir = os.path.dirname(__file__)
    workdir = os.path.join(script_dir, "..", "workdir")

    # Load every dataset found under that working directory.
    datasets = get_all_dataset(workdir)

    # Print a summary line followed by one line per dataset.
    dataset_count = len(datasets)
    print(f"Found {dataset_count} datasets:")
    for record in datasets.all():
        name, identifier = record['name'], record['id']
        print(f"- {name} (ID: {identifier})")