diff --git a/llama_hub/file/base.py b/llama_hub/file/base.py index c91fc6c41d..d373edc118 100644 --- a/llama_hub/file/base.py +++ b/llama_hub/file/base.py @@ -25,6 +25,7 @@ ".eml": "UnstructuredReader", ".html": "UnstructuredReader", ".json": "JSONReader", + ".org": "OrgReader", } diff --git a/llama_hub/file/org/README.md b/llama_hub/file/org/README.md new file mode 100644 index 0000000000..08e939565e --- /dev/null +++ b/llama_hub/file/org/README.md @@ -0,0 +1,19 @@ +# Org Loader + +This loader extracts the text from a local Org mode file. A single local file is passed in each time you call `load_data`. + +## Usage + +To use this loader, you need to pass in a `Path` to a local file. You can split the headings into separate documents by specifying `split_depth` >= 1. When `split_depth` is 0, the whole file becomes a single document. + +```python +from pathlib import Path +from llama_index import download_loader + +OrgReader = download_loader("OrgReader") + +loader = OrgReader(split_depth=1) +documents = loader.load_data(file=Path('./inbox.org')) +``` + +This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/run-llama/llama_index/tree/main/llama_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/llama_hub/file/org/__init__.py b/llama_hub/file/org/__init__.py new file mode 100644 index 0000000000..a66c0115e3 --- /dev/null +++ b/llama_hub/file/org/__init__.py @@ -0,0 +1,6 @@ +"""Init file.""" +from llama_hub.file.org.base import OrgReader + +__all__ = [ + "OrgReader", +] diff --git a/llama_hub/file/org/base.py b/llama_hub/file/org/base.py new file mode 100644 index 0000000000..7d3b2b4451 --- /dev/null +++ b/llama_hub/file/org/base.py @@ -0,0 +1,69 @@ +"""Org Reader. + +A parser for org files from OrgMode (Emacs). 
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, cast
from copy import deepcopy
from functools import cache

from llama_index.readers.base import BaseReader
from llama_index.schema import Document
from llama_index.bridge.pydantic import BaseModel


# Deliberately NOT memoized with functools.cache: an unbounded module-level
# cache keyed on node objects keeps every parsed node alive for the lifetime
# of the process and hands the same mutable list to every caller.
def get_text_from_org_node(current_node, format: str = "plain") -> List[str]:
    """Collect the text lines of an org node and all of its descendants.

    Only the heading and the body of each node are read, so the
    ``:PROPERTIES:`` drawer is not part of the returned text.

    Args:
        current_node: Node exposing ``heading``, ``body``, ``children``,
            ``get_heading`` and ``get_body`` (an orgparse node).
        format: Forwarded unchanged to ``get_heading`` / ``get_body``.

    Returns:
        A new list: the node's heading (when non-empty), then its body split
        on newlines (when non-empty), then the lines of each child in order.
    """
    lines: List[str] = []
    if current_node.heading:
        lines.append(current_node.get_heading(format=format))
    if current_node.body:
        lines.extend(current_node.get_body(format=format).split("\n"))
    for child in current_node.children:
        lines.extend(get_text_from_org_node(child, format=format))

    return lines


class OrgReader(BaseReader, BaseModel):
    """OrgReader

    Extract text from org files.
    Add the :PROPERTIES: on text node as extra_info
    """

    # Deepest heading level that still gets a document of its own.
    split_depth: int = 0
    text_formatting: str = "plain"  # plain or raw, as supported by orgparse

    def node_to_document(self, node, extra_info: Optional[Dict] = None) -> Document:
        """Convert an org node (with all of its descendants) to a document.

        Args:
            node: Org node whose text and ``properties`` are read.
            extra_info: Base metadata; it is deep-copied, never modified.

        Returns:
            A ``Document`` whose text is the node's lines joined with newlines
            and whose ``extra_info`` is the copied base metadata plus one
            ``org_property_<name>`` entry per node property.
        """
        text = "\n".join(get_text_from_org_node(node, format=self.text_formatting))
        # Copy first so per-node properties never leak into sibling documents.
        metadata = deepcopy(extra_info or {})
        for prop, value in node.properties.items():
            metadata["org_property_" + prop] = value
        return Document(text=text, extra_info=metadata)

    def load_data(
        self, file: Path, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Parse file into different documents based on root depth.

        Args:
            file: Path of the org file to load.
            extra_info: Optional metadata attached to every document. The
                caller's dict is left untouched.

        Returns:
            One document per node whose level is at most ``split_depth``, each
            carrying a ``filename`` entry in its ``extra_info``.
        """
        # Imported here rather than at module top, so orgparse is only needed
        # when load_data is actually called. The loader's requirements.txt
        # pins orgparse==0.4.20231004.
        from orgparse import OrgNode, load

        org_content: OrgNode = load(file)

        # Work on a copy: adding "filename" must not mutate the caller's dict.
        base_info: Dict = dict(extra_info or {})
        base_info["filename"] = org_content.env.filename

        # In orgparse, iterating org_content yields ALL the nodes in the file.
        # So we use this to process the nodes at or above the split_depth as
        # separate documents and skip the rest. This means at a split_depth
        # of 2, we make documents from nodes at levels 0 (whole file), 1, and 2.
        # The text will be present in multiple documents!
        return [
            self.node_to_document(node, base_info)
            for node in org_content
            if node.level <= self.split_depth
        ]