Markdown List Splitter
To pre-process my LogSeq database for RAG:
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Markdown List Splitter

This script parses a Markdown file containing lists (ordered or unordered)
and extracts the content of the leaf nodes from the list structure. It then
generates hierarchical, indented chunks based on the list nesting and prints
them to the console.

### Features:
- **Markdown Parsing**: Uses `markdown-it-py` to tokenize the Markdown content.
- **List Node Extraction**: Traverses the syntax tree to extract the deepest list items (leaf nodes).
- **Formatted Output**: Generates hierarchical chunks from the list items for easy readability.

### Use Case:
- Ideal for processing and extracting structured data from bullet point lists in Markdown documents.
- Can be adapted for processing outlines, summaries, or any list-based data in Markdown files.
- Depending on the content, chunks should be useful for Retrieval Augmented Generation (RAG).
- Typical inputs are LogSeq pages.

### Requirements:
- `markdown-it-py`
- `mdformat`

### Example:
Given a Markdown file with nested lists, the script prints one indented, bulleted chunk per leaf list item to the console (only the first 20 chunks are printed).

Usage:
1. Pass the path to your Markdown file as the only command-line argument, e.g. `python script_name.py "example_data/Agreement on a Unified Patent Court.md"`.
2. The script then prints the extracted and formatted list content.
""" | |
from markdown_it import MarkdownIt | |
from markdown_it.tree import SyntaxTreeNode | |
from mdformat.renderer import MDRenderer | |
def load_markdown(filename):
    """Read a Markdown file and tokenize it with markdown-it-py.

    Args:
        filename: Path of the Markdown file to read.

    Returns:
        The token list produced by ``MarkdownIt().parse`` for the file's text.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 rather than with the platform's locale
    # encoding, so non-ASCII text is read the same way on every system.
    with open(filename, 'r', encoding='utf-8') as f:
        markdown_text = f.read()
    return MarkdownIt().parse(markdown_text)
def render(node):
    """Turn a syntax-tree node back into Markdown text.

    The node is flattened to tokens with ``node.to_tokens()`` and handed to a
    fresh ``MDRenderer`` together with an empty options mapping and an empty
    environment.

    Args:
        node: A tree node providing ``to_tokens()``.

    Returns:
        Whatever ``MDRenderer.render`` returns for the node's tokens.
    """
    markdown_renderer = MDRenderer()
    # No renderer options and no shared environment: both start out empty.
    return markdown_renderer.render(node.to_tokens(), {}, {})
def find_leaf_nodes(node, path, leaf_paths):
    """Collect the ancestor path of every leaf list item below ``node``.

    List nodes (``bullet_list`` / ``ordered_list``) are traversed item by
    item. For each ``list_item`` the non-list children are rendered with
    ``render`` and concatenated into the item's text, which is pushed onto
    ``path`` while the item's nested lists are visited. An item that has text
    and no nested list is a leaf: a copy of the current ``path`` is appended
    to ``leaf_paths``. Nodes of any other type are ignored.

    Args:
        node: Tree node exposing ``type`` and ``children``.
        path: List of the texts of the enclosing items. It is modified during
            the traversal and restored to its incoming content on return.
        leaf_paths: Output list; one list of texts is appended per leaf item.

    Returns:
        None. Results are accumulated in ``leaf_paths``.
    """
    list_types = ('bullet_list', 'ordered_list')

    if node.type in list_types:
        for child in node.children:
            find_leaf_nodes(child, path, leaf_paths)
        return

    if node.type != 'list_item':
        return

    rendered_parts = []
    sublists = []
    for child in node.children:
        if child.type in list_types:
            # Keep every nested list, not only the last one encountered.
            sublists.append(child)
        else:
            rendered_parts.append(render(child))
    item_content = ''.join(rendered_parts).strip()

    # Only pop what was pushed: an item without text must neither remove its
    # parent's entry from the path nor hide its nested items.
    pushed = bool(item_content)
    if pushed:
        path.append(item_content)

    if sublists:
        for sublist in sublists:
            find_leaf_nodes(sublist, path, leaf_paths)
    elif pushed:
        # Copy the path, since the live list keeps changing as we backtrack.
        leaf_paths.append(list(path))

    if pushed:
        path.pop()
def generate_chunk(path, *, indent=' ', marker='- '):
    """Format a path of list-item texts as one nested bullet chunk.

    Each entry of ``path`` becomes one line; the entry at position ``n`` is
    prefixed with ``indent`` repeated ``n`` times followed by ``marker``.
    Entry text is inserted unchanged, so line breaks inside an entry are not
    re-indented.

    Args:
        path: Sequence of strings, outermost entry first.
        indent: String repeated once per nesting level. Defaults to a single
            space, the previously hard-coded value.
        marker: List marker placed before each entry. Defaults to ``'- '``,
            the previously hard-coded value.

    Returns:
        The lines joined with ``'\\n'``; an empty string for an empty path.
    """
    return '\n'.join(
        indent * indent_level + marker + content
        for indent_level, content in enumerate(path)
    )
def main(path, max_chunks=20):
    """Print the leaf-item chunks of the Markdown file at ``path``.

    The file is tokenized and turned into a syntax tree. Every top-level
    bullet or ordered list is searched for leaf items, and each leaf is
    printed as ``Chunk <n>:`` followed by its formatted path. The file name
    without its extension is the first entry of every path.

    Args:
        path: Path to the Markdown file to process.
        max_chunks: Maximum number of chunks to print. Defaults to 20, the
            previously hard-coded limit; pass ``None`` to print all chunks.

    Returns:
        None. Output is written to standard output.
    """
    import os

    basename = os.path.basename(path)
    filename, _ = os.path.splitext(basename)

    tokens = load_markdown(path)
    root = SyntaxTreeNode(tokens)

    leaf_paths = []
    for node in root.children:
        if node.type in ('bullet_list', 'ordered_list'):
            # Each top-level list starts from a fresh path rooted at the
            # file name.
            find_leaf_nodes(node, [filename], leaf_paths)

    # A distinct loop name avoids shadowing the ``path`` parameter.
    for number, leaf_path in enumerate(leaf_paths[:max_chunks], start=1):
        chunk = generate_chunk(leaf_path)
        print(f"Chunk {number}:\n{chunk}\n")
if __name__ == '__main__':
    import sys

    # Exactly one argument is expected after the script name: the path of
    # the Markdown file to process.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("Usage: python script_name.py <path_to_markdown_file>")
        sys.exit(1)

    main(cli_args[0])