refactor(parse_markdown): 重构 Markdown 解析逻辑并使用 Pydantic 模型

将 MarkdownNode 类重构为使用 Pydantic 模型,提高代码的可维护性和类型安全性。同时,将解析逻辑与节点操作分离,简化代码结构。
This commit is contained in:
carry 2025-04-04 20:50:39 +08:00
parent 22438d65d3
commit 3395b860e4
3 changed files with 38 additions and 25 deletions

4
schema/__init__.py Normal file
View File

@ -0,0 +1,4 @@
from .dataset import *
from .dataset_generation import APIProvider, LLMResponse, LLMRequest
from .md_doc import MarkdownNode
from .prompt_templeta import prompt_templeta

13
schema/md_doc.py Normal file
View File

@ -0,0 +1,13 @@
from pydantic import BaseModel, Field
from typing import List, Optional
class MarkdownNode(BaseModel):
level: int = Field(default=0, description="节点层级")
title: str = Field(default="Root", description="节点标题")
content: Optional[str] = Field(default=None, description="节点内容")
children: List['MarkdownNode'] = Field(default_factory=list, description="子节点列表")
class Config:
arbitrary_types_allowed = True
MarkdownNode.model_rebuild()

View File

@ -1,28 +1,24 @@
import re
import sys
from pathlib import Path
class MarkdownNode:
def __init__(self, level=0, title="Root"):
self.level = level
self.title = title
self.content = "" # 使用字符串存储合并后的内容
self.children = []
# 添加项目根目录到sys.path
sys.path.append(str(Path(__file__).resolve().parent.parent))
from schema import MarkdownNode
def __repr__(self):
return f"({self.level}) {self.title}"
def add_child(parent, child):
parent.children.append(child)
def add_child(self, child):
self.children.append(child)
def print_tree(self, indent=0):
prefix = "" * (indent - 1) + "└─ " if indent > 0 else ""
print(f"{prefix}{self.title}")
if self.content:
content_prefix = "" * indent + "├─ [内容]"
print(content_prefix)
for line in self.content.split('\n'):
print("" * indent + "" + line)
for child in self.children:
child.print_tree(indent + 1)
def print_tree(node, indent=0):
prefix = "" * (indent - 1) + "└─ " if indent > 0 else ""
print(f"{prefix}{node.title}")
if node.content:
content_prefix = "" * indent + "├─ [内容]"
print(content_prefix)
for line in node.content.split('\n'):
print("" * indent + "" + line)
for child in node.children:
print_tree(child, indent + 1)
def parse_markdown(markdown):
lines = markdown.split('\n')
@ -51,10 +47,10 @@ def parse_markdown(markdown):
if match:
level = len(match.group(1))
title = match.group(2)
node = MarkdownNode(level, title)
node = MarkdownNode(level=level, title=title, content="", children=[])
while stack[-1].level >= level:
stack.pop()
stack[-1].add_child(node)
add_child(stack[-1], node)
stack.append(node)
else:
if stack[-1].content:
@ -65,9 +61,9 @@ def parse_markdown(markdown):
if __name__=="__main__":
# 从文件读取 Markdown 内容
with open("example.md", "r", encoding="utf-8") as f:
with open("workdir/example.md", "r", encoding="utf-8") as f:
markdown = f.read()
# 解析 Markdown 并打印树结构
root = parse_markdown(markdown)
root.print_tree()
print_tree(root)