refactor(parse_markdown): 重构 Markdown 解析逻辑并使用 Pydantic 模型

将 MarkdownNode 类重构为使用 Pydantic 模型,提高代码的可维护性和类型安全性。同时,将解析逻辑与节点操作分离,简化代码结构。
This commit is contained in:
carry 2025-04-04 20:50:39 +08:00
parent 22438d65d3
commit 3395b860e4
3 changed files with 38 additions and 25 deletions

4
schema/__init__.py Normal file
View File

@ -0,0 +1,4 @@
from .dataset import *
from .dataset_generation import APIProvider, LLMResponse, LLMRequest
from .md_doc import MarkdownNode
from .prompt_templeta import prompt_templeta

13
schema/md_doc.py Normal file
View File

@ -0,0 +1,13 @@
from typing import List, Optional

from pydantic import BaseModel, ConfigDict, Field
class MarkdownNode(BaseModel):
    """A node in a tree built from a Markdown document.

    Every node stores its nesting level, a title, optional text content,
    and an ordered list of child nodes of the same type. With no arguments
    the model describes a root node (``level=0``, ``title="Root"``,
    ``content=None``, no children).
    """

    # Pydantic v2 configuration. This replaces the v1-style inner
    # ``class Config``, which v2 still accepts but reports as deprecated;
    # the setting itself is unchanged.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    level: int = Field(default=0, description="节点层级")
    title: str = Field(default="Root", description="节点标题")
    content: Optional[str] = Field(default=None, description="节点内容")
    # Forward reference to the class itself; ``default_factory`` gives each
    # instance its own list instead of sharing one mutable default.
    children: List['MarkdownNode'] = Field(default_factory=list, description="子节点列表")
MarkdownNode.model_rebuild()

View File

@ -1,28 +1,24 @@
import re import re
import sys
from pathlib import Path
class MarkdownNode: # 添加项目根目录到sys.path
def __init__(self, level=0, title="Root"): sys.path.append(str(Path(__file__).resolve().parent.parent))
self.level = level from schema import MarkdownNode
self.title = title
self.content = "" # 使用字符串存储合并后的内容
self.children = []
def __repr__(self): def add_child(parent, child):
return f"({self.level}) {self.title}" parent.children.append(child)
def add_child(self, child): def print_tree(node, indent=0):
self.children.append(child) prefix = "" * (indent - 1) + "└─ " if indent > 0 else ""
print(f"{prefix}{node.title}")
def print_tree(self, indent=0): if node.content:
prefix = "" * (indent - 1) + "└─ " if indent > 0 else "" content_prefix = "" * indent + "├─ [内容]"
print(f"{prefix}{self.title}") print(content_prefix)
if self.content: for line in node.content.split('\n'):
content_prefix = "" * indent + "├─ [内容]" print("" * indent + "" + line)
print(content_prefix) for child in node.children:
for line in self.content.split('\n'): print_tree(child, indent + 1)
print("" * indent + "" + line)
for child in self.children:
child.print_tree(indent + 1)
def parse_markdown(markdown): def parse_markdown(markdown):
lines = markdown.split('\n') lines = markdown.split('\n')
@ -51,10 +47,10 @@ def parse_markdown(markdown):
if match: if match:
level = len(match.group(1)) level = len(match.group(1))
title = match.group(2) title = match.group(2)
node = MarkdownNode(level, title) node = MarkdownNode(level=level, title=title, content="", children=[])
while stack[-1].level >= level: while stack[-1].level >= level:
stack.pop() stack.pop()
stack[-1].add_child(node) add_child(stack[-1], node)
stack.append(node) stack.append(node)
else: else:
if stack[-1].content: if stack[-1].content:
@ -65,9 +61,9 @@ def parse_markdown(markdown):
if __name__=="__main__": if __name__=="__main__":
# 从文件读取 Markdown 内容 # 从文件读取 Markdown 内容
with open("example.md", "r", encoding="utf-8") as f: with open("workdir/example.md", "r", encoding="utf-8") as f:
markdown = f.read() markdown = f.read()
# 解析 Markdown 并打印树结构 # 解析 Markdown 并打印树结构
root = parse_markdown(markdown) root = parse_markdown(markdown)
root.print_tree() print_tree(root)