gzhu-biyesheji/tools/parse_markdown.py
carry 541d37c674 feat(schema): 新增数据集相关模型并添加文档扫描功能
- 新增 dataset.py 文件,定义数据集相关模型
- 新增 tools 目录,包含解析 Markdown 和扫描文档的功能
- 修改 parse_markdown.py,增加处理 Markdown 文件的函数
- 新增 scan_doc_dir.py,实现文档目录扫描功能
2025-04-09 13:02:18 +08:00

93 lines
2.8 KiB
Python

import re
import sys
from pathlib import Path
# 添加项目根目录到sys.path
sys.path.append(str(Path(__file__).resolve().parent.parent))
from schema import MarkdownNode
def process_markdown_file(file_path):
with open(file_path, 'r', encoding='utf-8') as f:
content = f.read()
root = parse_markdown(content)
results = []
def traverse(node, parent_titles):
current_titles = parent_titles.copy()
current_titles.append(node.title)
if not node.children: # 叶子节点
if node.content:
full_text = ' -> '.join(current_titles) + '\n' + node.content
results.append(full_text)
else:
for child in node.children:
traverse(child, current_titles)
traverse(root, [])
return results
def add_child(parent, child):
parent.children.append(child)
def print_tree(node, indent=0):
prefix = "" * (indent - 1) + "└─ " if indent > 0 else ""
print(f"{prefix}{node.title}")
if node.content:
content_prefix = "" * indent + "├─ [内容]"
print(content_prefix)
for line in node.content.split('\n'):
print("" * indent + "" + line)
for child in node.children:
print_tree(child, indent + 1)
def parse_markdown(markdown):
lines = markdown.split('\n')
root = MarkdownNode()
stack = [root]
in_code_block = False
for line in lines:
if line.strip() == "":
continue
# 检测代码块开始/结束
if line.strip().startswith("```") or line.strip().startswith("~~~"):
in_code_block = not in_code_block
continue
# 如果当前在代码块中,直接作为内容处理
if in_code_block:
if stack[-1].content:
stack[-1].content += '\n'
stack[-1].content += line
continue
# 处理标题
match = re.match(r'^(#+)\s*(.*)', line)
if match:
level = len(match.group(1))
title = match.group(2)
node = MarkdownNode(level=level, title=title, content="", children=[])
while stack[-1].level >= level:
stack.pop()
add_child(stack[-1], node)
stack.append(node)
else:
if stack[-1].content:
stack[-1].content += '\n'
stack[-1].content += line
return root
if __name__=="__main__":
# # 从文件读取 Markdown 内容
# with open("workdir/example.md", "r", encoding="utf-8") as f:
# markdown = f.read()
# # 解析 Markdown 并打印树结构
# root = parse_markdown(markdown)
# print_tree(root)
for i in process_markdown_file("workdir/example.md"):
print("~"*20)
print(i)