# gzhu-biyesheji/tools/parse_markdown.py

import re
import sys
from pathlib import Path

# 添加项目根目录到sys.path
sys.path.append(str(Path(__file__).resolve().parent.parent))
from schema import MarkdownNode

def process_markdown_file(file_path):
    """Read a Markdown file and return one text chunk per non-empty leaf section.

    The file is decoded as UTF-8 and parsed with ``parse_markdown``. The
    resulting tree is walked depth-first, children in their stored order.
    Every node that has no children and has non-empty content yields a string
    made of the titles on the path from the root to that node joined with
    ``' -> '``, a newline, and the node's content. Nodes that have children
    contribute only their title to the path; their own content is not emitted.

    Args:
        file_path: Path of the Markdown file to read.

    Returns:
        A list of chunk strings in document order.
    """
    with open(file_path, encoding='utf-8') as handle:
        text = handle.read()

    def leaf_chunks(node, ancestors):
        # Build a fresh path list per node so sibling branches never share state.
        path = [*ancestors, node.title]
        if node.children:
            for child in node.children:
                yield from leaf_chunks(child, path)
        elif node.content:
            yield ' -> '.join(path) + '\n' + node.content

    return list(leaf_chunks(parse_markdown(text), []))
def add_child(parent, child):
    """Append ``child`` to the end of ``parent.children`` (mutates ``parent``)."""
    siblings = parent.children
    siblings.append(child)

def print_tree(node, indent=0):
    """Print ``node`` and its descendants to stdout as an indented tree.

    Each node prints its title on one line. If the node has non-empty content,
    a content marker line follows, then each line of the content behind a
    vertical gutter. Children are printed recursively, one level deeper.

    Args:
        node: Object exposing ``title``, ``content`` and ``children``.
        indent: Depth of ``node``; 0 for the node the caller starts from.
    """
    bar = "│  "

    # The starting node has no branch marker; deeper nodes get one.
    if indent > 0:
        heading_prefix = bar * (indent - 1) + "└─ "
    else:
        heading_prefix = ""
    print(f"{heading_prefix}{node.title}")

    if node.content:
        gutter = bar * indent
        print(gutter + "├─ [内容]")
        for text_line in node.content.split('\n'):
            print(gutter + bar + text_line)

    for child in node.children:
        print_tree(child, indent + 1)

def _fence_run(stripped):
    """Return ``(char, length)`` if ``stripped`` starts with a code fence, else ``None``.

    ``stripped`` is a line with surrounding whitespace already removed. A
    fence is a leading run of at least three backticks or three tildes. A
    backtick run followed by text that itself contains a backtick is not a
    fence (it is an inline code span written on its own line).
    """
    for fence_char in ("`", "~"):
        if stripped.startswith(fence_char * 3):
            length = len(stripped) - len(stripped.lstrip(fence_char))
            if fence_char == "`" and "`" in stripped[length:]:
                return None
            return fence_char, length
    return None


def parse_markdown(markdown):
    """Parse Markdown text into a tree of ``MarkdownNode`` objects.

    Headings (lines starting with one or more ``#``) become nodes whose
    ``level`` is the number of ``#`` characters; each heading is attached to
    the closest preceding heading with a smaller level. All other non-blank
    lines are appended, newline-separated, to the ``content`` of the most
    recent heading (or of the root when no heading has been seen yet).

    Fenced code blocks are tracked so that ``#`` lines inside them are kept
    as content rather than parsed as headings. A block opened by a fence is
    only closed by a fence made of the same character, at least as long as
    the opener, with nothing after it. A different or shorter fence inside
    the block is ordinary content. The fence lines themselves are not stored.

    Blank lines are skipped everywhere, including inside code blocks.

    Args:
        markdown: The Markdown document as a single string.

    Returns:
        The root ``MarkdownNode``; top-level headings are its children.
    """
    heading_pattern = re.compile(r'^(#+)\s*(.*)')

    # NOTE(review): relies on MarkdownNode() defaulting to a level below every
    # heading level and to an empty-string content -- confirm against schema.
    root = MarkdownNode()
    stack = [root]

    # (char, length) of the fence that opened the current code block, or None
    # when we are not inside one.
    open_fence = None

    def append_content(node, text):
        # Lines are joined with '\n'; no separator before the first line.
        if node.content:
            node.content += '\n'
        node.content += text

    for line in markdown.split('\n'):
        stripped = line.strip()
        if stripped == "":
            continue

        fence = _fence_run(stripped)

        if open_fence is not None:
            fence_char, fence_len = open_fence
            closes_block = (
                fence is not None
                and fence[0] == fence_char
                and fence[1] >= fence_len
                and len(stripped) == fence[1]  # closing fence carries no text
            )
            if closes_block:
                open_fence = None
            else:
                # Everything else inside the block is verbatim content.
                append_content(stack[-1], line)
            continue

        if fence is not None:
            open_fence = fence
            continue

        match = heading_pattern.match(line)
        if match:
            level = len(match.group(1))
            node = MarkdownNode(level=level, title=match.group(2), content="", children=[])
            # Unwind to the nearest ancestor with a strictly smaller level.
            while stack[-1].level >= level:
                stack.pop()
            add_child(stack[-1], node)
            stack.append(node)
        else:
            append_content(stack[-1], line)

    return root

if __name__ == "__main__":
    # Demo: print every extracted chunk from the example file, each preceded
    # by a separator line. To inspect the parsed tree instead, read the file,
    # call parse_markdown() on its text and pass the result to print_tree().
    separator = "~" * 20
    for chunk in process_markdown_file("workdir/example.md"):
        print(separator)
        print(chunk)