Skip to content
This repository has been archived by the owner on Mar 1, 2024. It is now read-only.

Add OrgReader #940

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions llama_hub/file/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
".eml": "UnstructuredReader",
".html": "UnstructuredReader",
".json": "JSONReader",
".org": "OrgReader",
}


Expand Down
19 changes: 19 additions & 0 deletions llama_hub/file/org/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Org Loader

This loader extracts the text from a local Org mode file. A single local file is passed in each time you call `load_data`.

## Usage

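To use this loader, you need to pass in a `Path` to a local file. You can split the headings into separate documents by specifying `split_depth` >= 1: every heading at a level up to `split_depth` becomes its own document, in addition to the document for the whole file, so the same text can appear in several documents. When `split_depth` is 0 (the default), the whole file becomes a single document.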

```python
from pathlib import Path
from llama_index import download_loader

OrgReader = download_loader("OrgReader")

loader = OrgReader(split_depth=1)
documents = loader.load_data(file=Path('./inbox.org'))
```

This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/run-llama/llama_index/tree/main/llama_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
6 changes: 6 additions & 0 deletions llama_hub/file/org/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"""Init file."""
from llama_hub.file.org.base import OrgReader

# Names exported by `from llama_hub.file.org import *`.
__all__ = [
"OrgReader",
]
69 changes: 69 additions & 0 deletions llama_hub/file/org/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
"""Org Reader.

A parser for org files from OrgMode (Emacs).

"""
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, cast
from copy import deepcopy
from functools import cache

from llama_index.readers.base import BaseReader
from llama_index.schema import Document
from llama_index.bridge.pydantic import BaseModel


@cache
def get_text_from_org_node(current_node, format: str = "plain") -> List[str]:
"""Extract text from org node. Skip properties"""
lines = []
if current_node.heading:
lines.append(current_node.get_heading(format=format))
if current_node.body:
lines.extend(current_node.get_body(format=format).split("\n"))
for child in current_node.children:
lines.extend(get_text_from_org_node(child, format=format))

return lines


class OrgReader(BaseReader, BaseModel):
    """Reader for Org mode (Emacs) files.

    Extracts the text of an org file into one or more ``Document`` objects.
    Each node's ``:PROPERTIES:`` entries are added to that document's
    ``extra_info`` under keys prefixed with ``org_property_``.
    """

    # Deepest node level that is turned into its own document. Level 0 is
    # the whole file, so the default yields a single document.
    split_depth: int = 0
    text_formatting: str = "plain"  # plain or raw, as supported by orgparse

    def node_to_document(self, node, extra_info: Optional[Dict] = None) -> Document:
        """Convert an org node (including its descendants) to a document.

        Args:
            node: The org node to convert; its ``properties`` mapping is
                copied into the document metadata.
            extra_info: Base metadata for the document. It is deep-copied,
                so the caller's dict is never modified.

        Returns:
            A ``Document`` whose text is the node's lines joined by newlines.
        """
        text = "\n".join(get_text_from_org_node(node, format=self.text_formatting))
        # Deep copy so per-node properties do not leak between documents.
        metadata = deepcopy(extra_info or {})
        for prop, value in node.properties.items():
            metadata["org_property_" + prop] = value
        return Document(text=text, extra_info=metadata)

    def load_data(
        self, file: Path, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Parse an org file into documents, split by heading depth.

        Args:
            file: Path of the org file to load.
            extra_info: Optional metadata attached to every document. A
                ``"filename"`` entry is added to a copy of it; the caller's
                dict is left untouched.

        Returns:
            One document per node whose level is at most ``split_depth``.
        """
        from orgparse import OrgNode, load

        org_content: OrgNode = load(file)

        # Work on a copy: writing "filename" into the caller's dict would be
        # a surprising side effect that persists across calls.
        base_info = dict(extra_info or {})
        base_info["filename"] = org_content.env.filename

        # Iterating the root node visits ALL the nodes in the file, not just
        # the top-level ones. Every node at or above split_depth becomes its
        # own document and deeper nodes are skipped. With a split_depth of 2
        # that means documents for levels 0 (the whole file), 1 and 2, so
        # the same text is present in multiple documents.
        return [
            self.node_to_document(node, base_info)
            for node in org_content
            if node.level <= self.split_depth
        ]
1 change: 1 addition & 0 deletions llama_hub/file/org/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
orgparse==0.4.20231004
5 changes: 5 additions & 0 deletions llama_hub/library.json
Original file line number Diff line number Diff line change
Expand Up @@ -866,5 +866,10 @@
"Capella",
"NoSQL"
]
},
"OrgReader": {
"id": "file/org",
"author": "RomainGehrig",
"keywords": ["org", "org-mode", "emacs"]
}
}
Loading