# Markdown List Splitter
#
# To pre-process my LogSeq database for RAG:

"""
Markdown List Splitter
This script parses a Markdown file containing lists (ordered or unordered)
and extracts the content of the leaf nodes from the list structure. It then
generates hierarchical, indented chunks based on the list nesting and prints
them to the console.
### Features:
- **Markdown Parsing**: Uses `markdown-it-py` to tokenize the markdown content.
- **List Node Extraction**: Traverses the syntax tree to extract the list items that have no nested list (leaf nodes), each together with its chain of ancestor items.
- **Formatted Output**: Generates hierarchical chunks from the list items for easy readability.
### Use Case:
- Ideal for processing and extracting structured data from bullet point lists in Markdown documents.
- Can be adapted for processing outlines, summaries, or any list-based data in Markdown files.
- Depending on the content, chunks should be useful for Retrieval Augmented Generation (RAG).
- Typical inputs are LogSeq pages.
### Requirements:
- `markdown-it-py`
- `mdformat`
### Example:
Given a Markdown file with nested lists, the script prints one chunk per leaf item: the file name and the item's ancestors as an indented bulleted list (items from ordered lists are also printed with `- ` markers).
Usage:
1. Run the script with the path to your Markdown file as its only argument: `python script_name.py <path_to_markdown_file>`.
2. The extracted and formatted list content (the first 20 chunks) is printed to the console.
"""
from markdown_it import MarkdownIt
from markdown_it.tree import SyntaxTreeNode
from mdformat.renderer import MDRenderer
def load_markdown(filename):
    """Read a Markdown file and tokenize it with markdown-it-py.

    Args:
        filename: Path of the Markdown file to read.

    Returns:
        The token list produced by ``MarkdownIt().parse`` for the file's text.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Decode explicitly as UTF-8: without an encoding argument open() falls
    # back to the platform's locale encoding, which can fail on (or silently
    # garble) non-ASCII notes on some systems.
    with open(filename, 'r', encoding='utf-8') as f:
        markdown_text = f.read()
    md = MarkdownIt()
    return md.parse(markdown_text)
def render(node):
    """Render a syntax-tree node back to Markdown text.

    The node is flattened to its token stream with ``node.to_tokens()`` and
    passed to mdformat's ``MDRenderer`` together with an empty options
    mapping and an empty environment.

    Args:
        node: Syntax-tree node to render; it must provide ``to_tokens()``.

    Returns:
        Whatever ``MDRenderer.render`` returns for the node's tokens.
    """
    return MDRenderer().render(node.to_tokens(), {}, {})
def find_leaf_nodes(node, path, leaf_paths):
    """Collect the ancestor path of every leaf list item below ``node``.

    A leaf is a list item that has text of its own and no nested list. For
    each leaf, a copy of ``path`` extended with the leaf's text is appended
    to ``leaf_paths``.

    Args:
        node: Node to inspect. ``bullet_list`` / ``ordered_list`` nodes are
            descended into, ``list_item`` nodes are processed, and any other
            node type is ignored.
        path: Texts of the enclosing items, outermost first. It is mutated
            during traversal but restored to its incoming state on return.
        leaf_paths: Output list; one new list per leaf is appended to it.

    Returns:
        None. Results are delivered through ``leaf_paths``.
    """
    list_types = ('bullet_list', 'ordered_list')

    if node.type in list_types:
        for child in node.children:
            find_leaf_nodes(child, path, leaf_paths)
        return
    if node.type != 'list_item':
        return

    # Keep every nested list, not just the last one: an item may contain
    # several lists separated by other content.
    sublists = [child for child in node.children if child.type in list_types]
    item_content = ''.join(
        render(child) for child in node.children if child.type not in list_types
    ).strip()

    # Push and pop are strictly paired, so an item without text of its own
    # can never remove an ancestor's entry from the shared path.
    if item_content:
        path.append(item_content)

    if sublists:
        # An item without own text still has its children visited; they are
        # simply attached to the nearest ancestor that has text.
        for sublist in sublists:
            find_leaf_nodes(sublist, path, leaf_paths)
    elif item_content:
        # Snapshot the path: the live list keeps changing during traversal.
        leaf_paths.append(list(path))

    if item_content:
        path.pop()
def generate_chunk(path, *, indent=' ', marker='- '):
    """Format a path of list-item texts as an indented bulleted chunk.

    Each entry of ``path`` becomes one line. The line for the entry at
    position ``n`` is prefixed with ``n`` copies of ``indent`` followed by
    ``marker``. Entry text is inserted verbatim, so line breaks inside an
    entry are not re-indented.

    Args:
        path: Sequence of item texts, outermost first.
        indent: String repeated once per nesting level (default: one space).
        marker: List marker placed before each entry (default: ``'- '``).

    Returns:
        The lines joined with newlines; an empty string for an empty path.
    """
    return '\n'.join(
        indent * level + marker + content
        for level, content in enumerate(path)
    )
def main(path, max_chunks=20):
    """Print the leaf-item chunks of a Markdown file to the console.

    The file is tokenized and every top-level bullet or ordered list is
    traversed. Each leaf path starts with the file's base name (without its
    extension) and is printed as ``Chunk <n>:`` followed by the formatted
    chunk and a blank line.

    Args:
        path: Path to the Markdown file.
        max_chunks: Maximum number of chunks to print, as a non-negative
            integer, or ``None`` to print all of them. Defaults to 20.
    """
    import os

    basename = os.path.basename(path)
    filename, _ = os.path.splitext(basename)

    tokens = load_markdown(path)
    root = SyntaxTreeNode(tokens)

    leaf_paths = []
    for node in root.children:
        # Only lists directly under the document root are traversed.
        if node.type in ('bullet_list', 'ordered_list'):
            find_leaf_nodes(node, [filename], leaf_paths)

    selected = leaf_paths if max_chunks is None else leaf_paths[:max_chunks]
    # A distinct loop name keeps the ``path`` parameter from being rebound.
    for i, leaf_path in enumerate(selected, start=1):
        chunk = generate_chunk(leaf_path)
        print(f"Chunk {i}:\n{chunk}\n")
if __name__ == '__main__':
    import sys

    # Exactly one positional argument is accepted: the Markdown file to split.
    # Unpacking raises ValueError for any other argument count.
    try:
        (markdown_file,) = sys.argv[1:]
    except ValueError:
        print("Usage: python script_name.py <path_to_markdown_file>")
        sys.exit(1)
    main(markdown_file)