# gzhu-biyesheji/tools/parse_markdown.py

import re
import sys
from pathlib import Path

# Add the project root directory (the parent of this file's directory) to sys.path
sys.path.append(str(Path(__file__).resolve().parent.parent))
from schema import MarkdownNode

def add_child(parent, child):
    """Attach ``child`` as the last entry of ``parent.children``.

    The parent's list is mutated in place; nothing is returned.
    """
    siblings = parent.children
    siblings.append(child)

def print_tree(node, indent=0):
    """Print ``node`` and all of its descendants as an indented text tree.

    Each node's title is printed on its own line. If the node has non-empty
    content, a "[内容]" marker line follows and then every content line,
    drawn one level deeper than the title. Children are printed recursively
    with ``indent + 1``.

    Args:
        node: Object exposing ``title``, ``content`` and ``children``.
        indent: Depth of ``node`` in the tree; 0 for the top-level call.
    """
    bar = "│  "

    # The top-level node gets no branch marker; deeper nodes get "└─ ".
    if indent > 0:
        title_prefix = bar * (indent - 1) + "└─ "
    else:
        title_prefix = ""
    print(f"{title_prefix}{node.title}")

    body = node.content
    if body:
        gutter = bar * indent
        print(gutter + "├─ [内容]")
        for text_line in body.split('\n'):
            print(gutter + bar + text_line)

    deeper = indent + 1
    for sub_node in node.children:
        print_tree(sub_node, deeper)

# Lenient heading matcher: one or more '#', optional whitespace, then the
# title text. Compiled once so the per-line loop does not look it up again.
_HEADING_RE = re.compile(r'^(#+)\s*(.*)')

# Prefixes that open a fenced code block.
_FENCE_MARKERS = ("```", "~~~")

def _append_line(node, line):
    """Append ``line`` to ``node.content``, newline-separated from prior text."""
    # The truthiness test also covers a ``None`` content, which ``+=`` would
    # reject with a TypeError.
    node.content = f"{node.content}\n{line}" if node.content else line

def parse_markdown(markdown):
    """Parse Markdown text into a tree of ``MarkdownNode`` objects.

    Heading lines become nodes nested by heading level (number of leading
    '#'). Every other non-blank line is appended to the ``content`` of the
    most recently opened heading, or of the root when no heading has been
    seen yet. Lines inside a fenced code block are stored verbatim as
    content and are never interpreted as headings; the fence delimiter
    lines themselves are not stored. Blank lines are skipped everywhere,
    including inside code blocks.

    Args:
        markdown: The Markdown document as a single string. Any line-ending
            convention recognised by ``str.splitlines`` is accepted, so CRLF
            input does not leave a trailing carriage return in titles.

    Returns:
        The root node, created with ``MarkdownNode()`` defaults, whose
        ``children`` hold the top-level headings.
    """
    root = MarkdownNode()
    stack = [root]  # path from the root to the currently open heading

    # Marker ("```" or "~~~") that opened the current code block, or None
    # when outside a code block. Tracking the marker means only a matching
    # fence closes the block, so a "~~~" line inside a "```" block is code.
    open_fence = None

    for line in markdown.splitlines():
        stripped = line.strip()
        if not stripped:
            continue

        if open_fence is not None:
            if stripped.startswith(open_fence):
                open_fence = None  # closing fence: leave code mode
            else:
                _append_line(stack[-1], line)
            continue

        if stripped.startswith(_FENCE_MARKERS):
            open_fence = stripped[:3]
            continue

        heading = _HEADING_RE.match(line)
        if heading is None:
            _append_line(stack[-1], line)
            continue

        level = len(heading.group(1))
        node = MarkdownNode(level=level, title=heading.group(2), content="", children=[])
        # Close every open heading at the same or a deeper level. The root is
        # never popped, so the stack cannot become empty regardless of the
        # root's own level value.
        while len(stack) > 1 and stack[-1].level >= level:
            stack.pop()
        add_child(stack[-1], node)
        stack.append(node)

    return root

if __name__ == "__main__":
    # Load the sample Markdown document, build its heading tree, and dump
    # the tree to stdout.
    sample_path = Path("workdir/example.md")
    document = sample_path.read_text(encoding="utf-8")
    print_tree(parse_markdown(document))