
- 新增 dataset.py 文件,定义数据集相关模型 - 新增 tools 目录,包含解析 Markdown 和扫描文档的功能 - 修改 parse_markdown.py,增加处理 Markdown 文件的函数 - 新增 scan_doc_dir.py,实现文档目录扫描功能
93 lines
2.8 KiB
Python
93 lines
2.8 KiB
Python
import re
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# 添加项目根目录到sys.path
|
|
sys.path.append(str(Path(__file__).resolve().parent.parent))
|
|
from schema import MarkdownNode
|
|
|
|
def process_markdown_file(file_path):
    """Read a Markdown file and return one text chunk per leaf section.

    The file is read as UTF-8 and parsed with ``parse_markdown``. The
    resulting tree is walked depth-first; for every node that has no
    children and has non-empty content, a string is produced consisting of
    the titles from the root down to that node joined by ``' -> '``, a
    newline, and the node's content.

    Nodes that have children contribute only their title to the path; their
    own content is not emitted. The root node's title is the first element
    of every path.

    Args:
        file_path: Path of the Markdown file to read.

    Returns:
        list[str]: The chunks, in document order.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        text = handle.read()

    tree = parse_markdown(text)
    chunks = []

    def collect(node, ancestors):
        # Build a fresh list so sibling branches never share path state.
        trail = [*ancestors, node.title]

        if node.children:
            # Inner node: descend; its own content is intentionally skipped.
            for sub in node.children:
                collect(sub, trail)
            return

        # Leaf node: emit only when there is something to emit.
        if node.content:
            chunks.append(' -> '.join(trail) + '\n' + node.content)

    collect(tree, [])
    return chunks
|
|
def add_child(parent, child):
    """Append ``child`` to the end of ``parent.children`` (in place).

    Args:
        parent: Node whose ``children`` list receives the new entry.
        child: Node to attach.
    """
    siblings = parent.children
    siblings.append(child)
|
|
|
|
def print_tree(node, indent=0):
    """Print ``node`` and all of its descendants to stdout as a text tree.

    Each node prints its title on one line. If the node has non-empty
    content, a ``[内容]`` ("content") marker line follows, then every line of
    the content, each prefixed with vertical bars matching the depth.
    Children are printed recursively one level deeper.

    Args:
        node: Object exposing ``title``, ``content`` and ``children``.
        indent: Depth of ``node`` in the tree; 0 for the top-level call.
    """
    bar = "│ "

    # The top-level node gets no branch marker; deeper nodes get "└─ "
    # preceded by one bar per ancestor level above their parent.
    if indent > 0:
        branch = bar * (indent - 1) + "└─ "
    else:
        branch = ""
    print(f"{branch}{node.title}")

    if node.content:
        gutter = bar * indent
        print(gutter + "├─ [内容]")
        for text_line in node.content.split('\n'):
            print(gutter + "│ " + text_line)

    for sub in node.children:
        print_tree(sub, indent + 1)
|
|
|
|
def parse_markdown(markdown):
    """Parse Markdown text into a tree of ``MarkdownNode`` keyed by headings.

    The text is processed line by line:

    * Lines that are empty after stripping are skipped everywhere, including
      inside fenced code blocks.
    * A line whose stripped form starts with three or more backticks or
      tildes opens a fenced code block. The block is closed only by a fence
      made of the same character that is at least as long as the opening
      one, so a ``` line inside a ~~~ block (or a shorter fence inside a
      longer one) is kept as content instead of ending the block early.
      The opening and closing fence lines themselves are not stored.
    * Outside code blocks, a line starting with one or more ``#`` becomes a
      new node whose level is the number of ``#`` characters. It is attached
      under the nearest preceding heading with a smaller level.
    * Every other line is appended to the content of the most recent
      heading (or of the root if no heading has been seen yet), with lines
      separated by ``'\n'``.

    Args:
        markdown: The Markdown source as a single string.

    Returns:
        The root ``MarkdownNode``, created with the class's default field
        values; top-level headings are its children.
    """
    # Compiled once here instead of being looked up again for every line.
    heading_re = re.compile(r'^(#+)\s*(.*)')
    fence_re = re.compile(r'^(`{3,}|~{3,})')

    def append_line(node, text):
        # Join content lines with '\n' without producing a leading newline.
        if node.content:
            node.content += '\n'
        node.content += text

    root = MarkdownNode()
    stack = [root]  # path from the root to the most recently seen heading

    # Fence string that opened the current code block, or None when the
    # parser is not inside a code block.
    open_fence = None

    for line in markdown.split('\n'):
        stripped = line.strip()
        if stripped == "":
            continue

        fence_match = fence_re.match(stripped)
        marker = fence_match.group(1) if fence_match else None

        if open_fence is not None:
            # Inside a code block: only a matching fence ends it; anything
            # else (including '#' lines and foreign fences) is plain content.
            closes_block = (
                marker is not None
                and marker[0] == open_fence[0]
                and len(marker) >= len(open_fence)
            )
            if closes_block:
                open_fence = None
            else:
                append_line(stack[-1], line)
            continue

        if marker is not None:
            open_fence = marker
            continue

        match = heading_re.match(line)
        if match:
            level = len(match.group(1))
            node = MarkdownNode(
                level=level, title=match.group(2), content="", children=[]
            )
            # Climb back to the nearest shallower heading. The root is never
            # popped, so the stack cannot become empty.
            while len(stack) > 1 and stack[-1].level >= level:
                stack.pop()
            add_child(stack[-1], node)
            stack.append(node)
        else:
            append_line(stack[-1], line)

    return root
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: print every leaf-section chunk of the sample
    # document, each preceded by a separator line.
    # For a structural view instead, read the file's text, parse it with
    # parse_markdown() and pass the result to print_tree().
    for chunk in process_markdown_file("workdir/example.md"):
        print("~" * 20)
        print(chunk)