From 3395b860e4539413add8cb2e1d01189e3671008d Mon Sep 17 00:00:00 2001 From: carry <2641257231@qq.com> Date: Fri, 4 Apr 2025 20:50:39 +0800 Subject: [PATCH] =?UTF-8?q?refactor(parse=5Fmarkdown):=20=E9=87=8D?= =?UTF-8?q?=E6=9E=84=20Markdown=20=E8=A7=A3=E6=9E=90=E9=80=BB=E8=BE=91?= =?UTF-8?q?=E5=B9=B6=E4=BD=BF=E7=94=A8=20Pydantic=20=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 将 MarkdownNode 类重构为使用 Pydantic 模型,提高代码的可维护性和类型安全性。同时,将解析逻辑与节点操作分离,简化代码结构。 --- schema/__init__.py | 4 ++++ schema/md_doc.py | 13 ++++++++++++ tools/parse_markdown.py | 46 +++++++++++++++++++---------------------- 3 files changed, 38 insertions(+), 25 deletions(-) create mode 100644 schema/__init__.py create mode 100644 schema/md_doc.py diff --git a/schema/__init__.py b/schema/__init__.py new file mode 100644 index 0000000..909ccd1 --- /dev/null +++ b/schema/__init__.py @@ -0,0 +1,4 @@ +from .dataset import * +from .dataset_generation import APIProvider, LLMResponse, LLMRequest +from .md_doc import MarkdownNode +from .prompt_templeta import prompt_templeta \ No newline at end of file diff --git a/schema/md_doc.py b/schema/md_doc.py new file mode 100644 index 0000000..d67d6dd --- /dev/null +++ b/schema/md_doc.py @@ -0,0 +1,13 @@ +from pydantic import BaseModel, Field +from typing import List, Optional + +class MarkdownNode(BaseModel): + level: int = Field(default=0, description="节点层级") + title: str = Field(default="Root", description="节点标题") + content: Optional[str] = Field(default=None, description="节点内容") + children: List['MarkdownNode'] = Field(default_factory=list, description="子节点列表") + + class Config: + arbitrary_types_allowed = True + +MarkdownNode.model_rebuild() diff --git a/tools/parse_markdown.py b/tools/parse_markdown.py index b9f41ea..6eeba01 100644 --- a/tools/parse_markdown.py +++ b/tools/parse_markdown.py @@ -1,28 +1,24 @@ import re +import sys +from pathlib import Path -class MarkdownNode: - def __init__(self, level=0, title="Root"): - self.level = level - self.title = title - self.content = "" # 使用字符串存储合并后的内容 - self.children = [] +# 添加项目根目录到sys.path +sys.path.append(str(Path(__file__).resolve().parent.parent)) +from schema import MarkdownNode - def __repr__(self): - return f"({self.level}) {self.title}" +def add_child(parent, child): + parent.children.append(child) - def add_child(self, child): - self.children.append(child) - - def print_tree(self, indent=0): - prefix = "│ " * (indent - 1) + "└─ " if indent > 0 else "" - print(f"{prefix}{self.title}") - if self.content: - content_prefix = "│ " * indent + "├─ [内容]" - print(content_prefix) - for line in self.content.split('\n'): - print("│ " * indent + "│ " + line) - for child in self.children: - child.print_tree(indent + 1) +def print_tree(node, indent=0): + prefix = "│ " * (indent - 1) + "└─ " if indent > 0 else "" + print(f"{prefix}{node.title}") + if node.content: + content_prefix = "│ " * indent + "├─ [内容]" + print(content_prefix) + for line in node.content.split('\n'): + print("│ " * indent + "│ " + line) + for child in node.children: + print_tree(child, indent + 1) def parse_markdown(markdown): lines = markdown.split('\n') @@ -51,10 +47,10 @@ def parse_markdown(markdown): if match: level = len(match.group(1)) title = match.group(2) - node = MarkdownNode(level, title) + node = MarkdownNode(level=level, title=title, content="", children=[]) while stack[-1].level >= level: stack.pop() - stack[-1].add_child(node) + add_child(stack[-1], node) stack.append(node) else: if stack[-1].content: @@ -65,9 +61,9 @@ def parse_markdown(markdown): if __name__=="__main__": # 从文件读取 Markdown 内容 - with open("example.md", "r", encoding="utf-8") as f: + with open("workdir/example.md", "r", encoding="utf-8") as f: markdown = f.read() # 解析 Markdown 并打印树结构 root = parse_markdown(markdown) - root.print_tree() + print_tree(root)