run-llama · RomainGehrig · Feb 9, 2024 · Feb 9, 2024
diff --git a/llama_hub/file/base.py b/llama_hub/file/base.py
@@ -25,6 +25,7 @@
     ".eml": "UnstructuredReader",
     ".html": "UnstructuredReader",
     ".json": "JSONReader",
+    ".org": "OrgReader",
 }
 
 

diff --git a/llama_hub/file/org/README.md b/llama_hub/file/org/README.md
@@ -0,0 +1,19 @@
+# Org Loader
+
+This loader extracts the text from a local Org mode file. A single local file is passed in each time you call `load_data`.
+
+## Usage
+
+To use this loader, you need to pass in a `Path` to a local file. You can split the headings into separate documents by specifying `split_depth` >= 1. When `split_depth` is 0, the whole file becomes a single document.
+
+```python
+from pathlib import Path
+from llama_index import download_loader
+
+OrgReader = download_loader("OrgReader")
+
+loader = OrgReader(split_depth=1)
+documents = loader.load_data(file=Path('./inbox.org'))
+```
+
+This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/run-llama/llama_index/tree/main/llama_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
diff --git a/llama_hub/file/org/__init__.py b/llama_hub/file/org/__init__.py
@@ -0,0 +1,6 @@
+"""Init file."""
+from llama_hub.file.org.base import OrgReader  # re-exported as the package's public name
+
+__all__ = [
+    "OrgReader",
+]
diff --git a/llama_hub/file/org/base.py b/llama_hub/file/org/base.py
@@ -0,0 +1,69 @@
+"""Org Reader.
+
+A parser for org files from OrgMode (Emacs).
+
+"""
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, cast
+from copy import deepcopy
+from functools import cache
+
+from llama_index.readers.base import BaseReader
+from llama_index.schema import Document
+from llama_index.bridge.pydantic import BaseModel
+
+
+def get_text_from_org_node(current_node, format: str = "plain") -> List[str]:
+    """Return heading and body lines of a node and its descendants, depth-first.
+
+    Properties are skipped. Not memoised on purpose: a process-wide cache keyed
+    on nodes would keep every parsed tree alive and hand out a shared list.
+    """
+    lines = [current_node.get_heading(format=format)] if current_node.heading else []
+    if current_node.body:
+        lines.extend(current_node.get_body(format=format).split("\n"))
+    for child in current_node.children:
+        lines.extend(get_text_from_org_node(child, format=format))
+    return lines
+
+
+class OrgReader(BaseReader, BaseModel):
+    """Reader that turns an Org mode file into one or more documents.
+
+    Every node whose level is at most ``split_depth`` becomes a document; its
+    :PROPERTIES: are stored in the metadata under ``org_property_<name>``.
+    """
+
+    split_depth: int = 0  # deepest node level that still gets its own document
+    text_formatting: str = "plain"  # plain or raw, as supported by orgparse
+
+    def node_to_document(self, node, extra_info: Optional[Dict] = None) -> Document:
+        """Convert an org node (with its subtree) to a document."""
+        text = "\n".join(get_text_from_org_node(node, format=self.text_formatting))
+        extra_info = deepcopy(extra_info or {})
+        for prop, value in node.properties.items():
+            extra_info["org_property_" + prop] = value
+        return Document(text=text, extra_info=extra_info)
+
+    def load_data(
+        self, file: Path, extra_info: Optional[Dict] = None
+    ) -> List[Document]:
+        """Parse ``file`` into documents, one per node down to ``split_depth``.
+
+        ``extra_info`` is merged, together with a ``filename`` entry, into the
+        metadata of every document; the dict passed by the caller is not modified.
+        """
+        from orgparse import OrgNode, load
+
+        org_content: OrgNode = load(file)
+        # Build a new dict so the caller's ``extra_info`` is never mutated.
+        extra_info = {**(extra_info or {}), "filename": org_content.env.filename}
+
+        # In orgparse, iterating the root yields ALL the nodes in the file, so
+        # with a split_depth of 2 we make documents from nodes at levels 0
+        # (whole file), 1, and 2. The text will be present in multiple documents!
+        return [
+            self.node_to_document(node, extra_info)
+            for node in org_content
+            if node.level <= self.split_depth
+        ]
diff --git a/llama_hub/file/org/requirements.txt b/llama_hub/file/org/requirements.txt
@@ -0,0 +1 @@
+orgparse==0.4.20231004
diff --git a/llama_hub/library.json b/llama_hub/library.json
@@ -866,5 +866,10 @@
       "Capella",
       "NoSQL"
     ]
+  },
+  "OrgReader": {
+    "id": "file/org",
+    "author": "RomainGehrig",
+    "keywords": ["org", "org-mode", "emacs"]
   }
 }