From 541d37c674de9d4f30d39e516e74228d6feeab51 Mon Sep 17 00:00:00 2001
From: carry <2641257231@qq.com>
Date: Wed, 9 Apr 2025 13:02:18 +0800
Subject: [PATCH] =?UTF-8?q?feat(schema):=20=E6=96=B0=E5=A2=9E=E6=95=B0?=
 =?UTF-8?q?=E6=8D=AE=E9=9B=86=E7=9B=B8=E5=85=B3=E6=A8=A1=E5=9E=8B=E5=B9=B6?=
 =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E6=96=87=E6=A1=A3=E6=89=AB=E6=8F=8F=E5=8A=9F?=
 =?UTF-8?q?=E8=83=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 新增 dataset.py 文件,定义数据集相关模型
- 新增 tools 目录,包含解析 Markdown 和扫描文档的功能
- 修改 parse_markdown.py,增加处理 Markdown 文件的函数
- 新增 scan_doc_dir.py,实现文档目录扫描功能
---
 schema/dataset.py       | 34 ++++++++++++++++++++++++++++++++++
 tools/__init__.py       |  2 ++
 tools/parse_markdown.py | 36 ++++++++++++++++++++++++++++++------
 tools/scan_doc_dir.py   | 32 ++++++++++++++++++++++++++++++++
 4 files changed, 98 insertions(+), 6 deletions(-)
 create mode 100644 schema/dataset.py
 create mode 100644 tools/__init__.py
 create mode 100644 tools/scan_doc_dir.py

diff --git a/schema/dataset.py b/schema/dataset.py
new file mode 100644
index 0000000..c60ffed
--- /dev/null
+++ b/schema/dataset.py
@@ -0,0 +1,34 @@
+from typing import Optional
+from pydantic import BaseModel, Field
+from datetime import datetime, timezone
+
+class doc(BaseModel):
+    """A document folder together with the Markdown files found in it."""
+    id: Optional[int] = Field(default=None, description="文档ID")
+    name: str = Field(default="", description="文档名称")
+    path: str = Field(default="", description="文档路径")
+    markdown_files: list[str] = Field(default_factory=list, description="文档路径列表")
+
+class Q_A(BaseModel):
+    """One question/answer pair; both fields are required and non-empty."""
+    # No default here: pydantic does not validate defaults, so default=""
+    # would silently bypass min_length=1.
+    question: str = Field(..., min_length=1, description="问题")
+    answer: str = Field(..., min_length=1, description="答案")
+
+class dataset_item(BaseModel):
+    """A single dataset entry made of question/answer pairs."""
+    id: Optional[int] = Field(default=None, description="数据集项ID")
+    message: list[Q_A] = Field(description="数据集项内容")
+
+class dataset(BaseModel):
+    """A named collection of dataset items plus its metadata."""
+    id: Optional[int] = Field(default=None, description="数据集ID")
+    name: Optional[str] = Field(default=None, description="数据集名称")
+    model_id:
Optional[list[str]] = Field(default=None, description="数据集使用的模型ID")
+    description: Optional[str] = Field(default="", description="数据集描述")
+    created_at: datetime = Field(
+        default_factory=lambda: datetime.now(timezone.utc),
+        description="记录创建时间"
+    )
+    dataset_items: list[dataset_item] = Field(default_factory=list, description="数据集项列表")
diff --git a/tools/__init__.py b/tools/__init__.py
new file mode 100644
index 0000000..1c1e383
--- /dev/null
+++ b/tools/__init__.py
@@ -0,0 +1,2 @@
+from .parse_markdown import parse_markdown, process_markdown_file
+from .scan_doc_dir import scan_docs_directory
diff --git a/tools/parse_markdown.py b/tools/parse_markdown.py
index 6eeba01..92dae9e 100644
--- a/tools/parse_markdown.py
+++ b/tools/parse_markdown.py
@@ -6,6 +6,44 @@ from pathlib import Path
 sys.path.append(str(Path(__file__).resolve().parent.parent))
 from schema import MarkdownNode
 
+def process_markdown_file(file_path):
+    """Split a Markdown file into one text chunk per leaf heading.
+
+    The file is read as UTF-8 and parsed with ``parse_markdown``; the
+    resulting tree is then walked depth-first.  Every node that has no
+    children and a non-empty ``content`` produces one string: the titles on
+    the path from the root down to that node joined by ``' -> '``, a
+    newline, and the node's content.  Content held by nodes that do have
+    children is not emitted.
+
+    Args:
+        file_path: Path of the Markdown file to read.
+
+    Returns:
+        list[str]: The chunks, in traversal order.
+    """
+    with open(file_path, 'r', encoding='utf-8') as f:
+        content = f.read()
+
+    root = parse_markdown(content)
+    results = []
+
+    def traverse(node, parent_titles):
+        # Build a new list so sibling branches never share title state.
+        current_titles = parent_titles + [node.title]
+
+        if not node.children:  # leaf node
+            if node.content:
+                full_text = ' -> '.join(current_titles) + '\n' + node.content
+                results.append(full_text)
+        else:
+            for child in node.children:
+                traverse(child, current_titles)
+
+    traverse(root, [])
+    return results
+
+
 def add_child(parent, child):
     parent.children.append(child)
 
@@ -60,10 +98,13 @@ def parse_markdown(markdown):
     return root
 
 if __name__=="__main__":
-    # 从文件读取 Markdown 内容
-    with open("workdir/example.md", "r", encoding="utf-8") as f:
-        markdown = f.read()
+    # # Read the Markdown content from a file
+    # with open("workdir/example.md", "r", encoding="utf-8") as f:
+    #     markdown = f.read()
 
-    # 解析 Markdown 并打印树结构
-    root = parse_markdown(markdown)
-    print_tree(root)
+    # # Parse the Markdown and print the tree structure
+    # root = parse_markdown(markdown)
+    # print_tree(root)
+    for i in
process_markdown_file("workdir/example.md"):
+        print("~"*20)
+        print(i)
diff --git a/tools/scan_doc_dir.py b/tools/scan_doc_dir.py
new file mode 100644
index 0000000..bd17d1b
--- /dev/null
+++ b/tools/scan_doc_dir.py
@@ -0,0 +1,55 @@
+"""Scan ``<workdir>/docs`` for document folders and their Markdown files."""
+import sys
+import os
+from pathlib import Path
+
+# Put the project root on sys.path so the ``schema`` package can be imported.
+sys.path.append(str(Path(__file__).resolve().parent.parent))
+from schema.dataset import doc
+
+
+def scan_docs_directory(workdir: str) -> list[doc]:
+    """Collect one ``doc`` record per sub-directory of ``<workdir>/docs``.
+
+    Each immediate sub-directory of ``<workdir>/docs`` is treated as one
+    document; plain files at that level are ignored.  The sub-directory is
+    walked recursively and every file whose name ends with ``.md`` is
+    recorded.  Entries are visited in sorted order so the result does not
+    depend on the order the file system happens to list them in.
+
+    Args:
+        workdir: Directory that contains the ``docs`` folder.
+
+    Returns:
+        A list of ``doc`` objects with ``name`` (sub-directory name),
+        ``path`` (sub-directory path) and ``markdown_files`` (paths of the
+        ``.md`` files found beneath it).
+
+    Raises:
+        OSError: If ``<workdir>/docs`` cannot be listed (for example,
+            ``FileNotFoundError`` when it does not exist).
+    """
+    docs_dir = os.path.join(workdir, "docs")
+    to_return = []
+
+    for doc_name in sorted(os.listdir(docs_dir)):
+        doc_path = os.path.join(docs_dir, doc_name)
+        if not os.path.isdir(doc_path):
+            continue
+
+        markdown_files = []
+        for root, dirs, files in os.walk(doc_path):
+            dirs.sort()  # in-place sort fixes the order os.walk descends in
+            for file in sorted(files):
+                if file.endswith(".md"):
+                    markdown_files.append(os.path.join(root, file))
+        to_return.append(doc(name=doc_name, path=doc_path, markdown_files=markdown_files))
+
+    return to_return
+
+
+# Manual smoke test: scan the ``workdir`` folder one level above this file.
+if __name__ == "__main__":
+    workdir = os.path.join(os.path.dirname(__file__), "..", "workdir")
+    docs = scan_docs_directory(workdir)
+    print(docs)