diff --git a/sdk/python/generative-ai/rag/code_first/README.md b/sdk/python/generative-ai/rag/code_first/README.md
index a291b96d56..6b4c8a0fc8 100644
--- a/sdk/python/generative-ai/rag/code_first/README.md
+++ b/sdk/python/generative-ai/rag/code_first/README.md
@@ -6,8 +6,8 @@ Read more about their structure [here](./docs/mlindex.md).
## Pre-requisites
0. Install `azure-ai-ml` and `azureml-rag`:
- - `pip install 'azure-ai-ml==1.10.0a20230825006' --extra-index-url https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/`
- - `pip install -U 'azureml-rag[document_parsing,faiss,cognitive_search]>=0.2.0'`
+ - `pip install 'azure-ai-ml>=1.10'`
+ - `pip install 'azureml-rag[document_parsing,faiss,cognitive_search]>=0.2.0'`
1. You have unstructured data.
- In one of [AzureMLs supported data sources](https://learn.microsoft.com/azure/machine-learning/concept-data?view=azureml-api-2): Blob, ADLSgen2, OneLake, S3, Git
- In any of these supported file formats: md, txt, py, pdf, ppt(x), doc(x)
diff --git a/sdk/python/generative-ai/rag/code_first/data_index_job/cog_search_docs_faiss_mlindex.py b/sdk/python/generative-ai/rag/code_first/data_index_job/cog_search_docs_faiss_mlindex.py
index a6bae38fde..f6ed03d57d 100644
--- a/sdk/python/generative-ai/rag/code_first/data_index_job/cog_search_docs_faiss_mlindex.py
+++ b/sdk/python/generative-ai/rag/code_first/data_index_job/cog_search_docs_faiss_mlindex.py
@@ -2,8 +2,6 @@
# # Local Documents to Azure Cognitive Search Index
# %% Prerequisites
-# %pip install 'azure-ai-ml==1.10.0a20230825006' --extra-index-url https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/
-# %pip install 'azureml-rag[faiss]>=0.2.0'
# %pip install 'promptflow[azure]' promptflow-tools promptflow-vectordb
# %% Authenticate to you AzureML Workspace, download a `config.json` from the top right hand corner menu of the Workspace.
diff --git a/sdk/python/generative-ai/rag/code_first/data_index_job/local_docs_to_acs_mlindex.py b/sdk/python/generative-ai/rag/code_first/data_index_job/local_docs_to_acs_mlindex.py
index 8dd05f64d1..ef5d34e2bf 100644
--- a/sdk/python/generative-ai/rag/code_first/data_index_job/local_docs_to_acs_mlindex.py
+++ b/sdk/python/generative-ai/rag/code_first/data_index_job/local_docs_to_acs_mlindex.py
@@ -1,11 +1,8 @@
# %%[markdown]
# # Local Documents to Azure Cognitive Search Index
-# %% Prerequisites
-# %pip install 'azure-ai-ml==1.10.0a20230825006' --extra-index-url https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/
-# %pip install 'azureml-rag[cognitive_search]>=0.2.0'
-
# %% Authenticate to you AzureML Workspace, download a `config.json` from the top right hand corner menu of the Workspace.
+from azureml.rag.dataindex import DataIndex
from azure.ai.ml import MLClient, load_data
from azure.identity import DefaultAzureCredential
diff --git a/sdk/python/generative-ai/rag/code_first/data_index_job/s3_to_acs_mlindex.py b/sdk/python/generative-ai/rag/code_first/data_index_job/s3_to_acs_mlindex.py
index b9e253e5b3..5401564e03 100644
--- a/sdk/python/generative-ai/rag/code_first/data_index_job/s3_to_acs_mlindex.py
+++ b/sdk/python/generative-ai/rag/code_first/data_index_job/s3_to_acs_mlindex.py
@@ -1,10 +1,6 @@
# %%[markdown]
# # S3 via OneLake to Azure Cognitive Search Index
-# %% Prerequisites
-# %pip install 'azure-ai-ml==1.10.0a20230825006' --extra-index-url https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/
-# %pip install 'azureml-rag[cognitive_search]>=0.2.0'
-
# %% Authenticate to an AzureML Workspace, you can download a `config.json` from the top-right-hand corner menu of a Workspace.
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
diff --git a/sdk/python/generative-ai/rag/code_first/data_index_job/scheduled_s3_to_asc_mlindex.py b/sdk/python/generative-ai/rag/code_first/data_index_job/scheduled_s3_to_asc_mlindex.py
index f30bfa1cb3..f8676ddbe0 100644
--- a/sdk/python/generative-ai/rag/code_first/data_index_job/scheduled_s3_to_asc_mlindex.py
+++ b/sdk/python/generative-ai/rag/code_first/data_index_job/scheduled_s3_to_asc_mlindex.py
@@ -1,10 +1,6 @@
# %%[markdown]
# # S3 via OneLake to Azure Cognitive Search Index
-# %% Prerequisites
-# %pip install 'azure-ai-ml==1.10.0a20230825006' --extra-index-url https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/
-# %pip install 'azureml-rag[cognitive_search]>=0.2.0'
-
# %% Authenticate to an AzureML Workspace, you can download a `config.json` from the top-right-hand corner menu of a Workspace.
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
diff --git a/sdk/python/generative-ai/rag/code_first/mlindex_local/langchain_docs_to_mlindex.py b/sdk/python/generative-ai/rag/code_first/mlindex_local/langchain_docs_to_mlindex.py
index b3674d5f2c..584d612b74 100644
--- a/sdk/python/generative-ai/rag/code_first/mlindex_local/langchain_docs_to_mlindex.py
+++ b/sdk/python/generative-ai/rag/code_first/mlindex_local/langchain_docs_to_mlindex.py
@@ -2,8 +2,6 @@
# # Build an ACS Index using langchain data loaders and MLIndex SDK
# %% Pre-requisites
-# %pip install 'azure-ai-ml==1.10.0a20230825006' --extra-index-url https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/
-# %pip install 'azureml-rag[cognitive_search]>=0.2.0'
# %pip install wikipedia
# %% Get Azure Cognitive Search Connection
@@ -33,6 +31,7 @@
# %%
from azureml.rag.mlindex import MLIndex
+mlindex_output_path = "./hunter_x_hunter_aoai_acs"
# Process data into FAISS Index using HuggingFace embeddings
mlindex = MLIndex.from_documents(
documents=split_docs,
@@ -42,9 +41,31 @@
index_type="acs",
index_connection=acs_connection,
index_config={"index_name": "hunter_x_hunter_aoai_acs"},
+ output_path=mlindex_output_path,
)
# %% Query documents, use with inferencing framework
index = mlindex.as_langchain_vectorstore()
docs = index.similarity_search("What is bungie gum?", k=5)
print(docs)
+
+# %% Register local MLIndex as remote asset
+from azure.ai.ml.entities import Data
+
+asset_name = "hunter_x_hunter_aoai_acs_mlindex"
+asset = ml_client.data.create_or_update(
+ Data(
+ name=asset_name,
+ version="1",
+ path=mlindex_output_path,
+ description="MLIndex Documentation Embedded using Azure OpenAI indexed using Azure Cognitive Search.",
+ properties={
+ "azureml.mlIndexAssetKind": "acs",
+ "azureml.mlIndexAsset": "true",
+ "azureml.mlIndexAssetSource": "Local Data",
+ "azureml.mlIndexAssetPipelineRunId": "Local",
+ },
+ )
+)
+
+print(asset)
diff --git a/sdk/python/generative-ai/rag/code_first/mlindex_local/local_docs_to_acs_aoai_mlindex.py b/sdk/python/generative-ai/rag/code_first/mlindex_local/local_docs_to_acs_aoai_mlindex.py
index 82fe858554..cb3f4d1acc 100644
--- a/sdk/python/generative-ai/rag/code_first/mlindex_local/local_docs_to_acs_aoai_mlindex.py
+++ b/sdk/python/generative-ai/rag/code_first/mlindex_local/local_docs_to_acs_aoai_mlindex.py
@@ -1,10 +1,6 @@
# %%[markdown]
# # Build an ACS Index using MLIndex SDK
-# %% Pre-requisites
-# %pip install 'azure-ai-ml==1.10.0a20230825006' --extra-index-url https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/
-# %pip install 'azureml-rag[document_parsing,cognitive_search]>=0.2.0'
-
# %% Get Azure Cognitive Search Connection
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
@@ -17,6 +13,7 @@
# %%
from azureml.rag.mlindex import MLIndex
+mlindex_output_path = "./acs_open_ai_index"
# Process data into FAISS Index using HuggingFace embeddings
mlindex = MLIndex.from_files(
source_uri="../",
@@ -28,15 +25,39 @@
index_type="acs",
index_connection=acs_connection,
index_config={"index_name": "mlindex_docs_aoai_acs"},
- output_path="./acs_open_ai_index",
+ output_path=mlindex_output_path,
)
# %% Load MLIndex from local
from azureml.rag.mlindex import MLIndex
-mlindex = MLIndex("./acs_open_ai_index")
+mlindex = MLIndex(mlindex_output_path)
# %% Query documents, use with inferencing framework
index = mlindex.as_langchain_vectorstore()
docs = index.similarity_search("Topic in my data.", k=5)
print(docs)
+
+# %% Register local MLIndex as remote asset
+from azure.ai.ml.entities import Data
+
+# TODO: MLIndex should help with registering the index (ACS here) as an asset with all the properties.
+asset_name = "mlindex_docs_aoai_acs_mlindex"
+asset = ml_client.data.create_or_update(
+ Data(
+ name=asset_name,
+ version="1",
+ path=mlindex_output_path,
+ description="MLIndex Documentation Embedded using Azure OpenAI indexed using Azure Cognitive Search.",
+ properties={
+ "azureml.mlIndexAssetKind": "acs",
+ "azureml.mlIndexAsset": "true",
+ "azureml.mlIndexAssetSource": "Local Data",
+ "azureml.mlIndexAssetPipelineRunId": "Local",
+ },
+ )
+)
+
+print(asset)
+
+# %%
diff --git a/sdk/python/generative-ai/rag/code_first/mlindex_local/local_docs_to_faiss_mlindex.py b/sdk/python/generative-ai/rag/code_first/mlindex_local/local_docs_to_faiss_mlindex.py
index fbde424b96..5762d7c89a 100644
--- a/sdk/python/generative-ai/rag/code_first/mlindex_local/local_docs_to_faiss_mlindex.py
+++ b/sdk/python/generative-ai/rag/code_first/mlindex_local/local_docs_to_faiss_mlindex.py
@@ -1,10 +1,6 @@
# %%[markdown]
# # Build a Faiss Index using MLIndex SDK
-# %% Pre-requisites
-# %pip install 'azure-ai-ml==1.10.0a20230825006' --extra-index-url https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/
-# %pip install 'azureml-rag[document_parsing,faiss,hugging_face]>=0.2.0'
-
# %%
from azureml.rag.mlindex import MLIndex
@@ -13,7 +9,6 @@
source_uri="../",
source_glob="**/*",
chunk_size=200,
- # embeddings_model=sentence_transformers.SentenceTransformer('sentence-transformers/all-mpnet-base-v2'),
embeddings_model="hugging_face://model/sentence-transformers/all-mpnet-base-v2",
embeddings_container="./.embeddings_cache/mlindex_docs_mpnet_faiss",
index_type="faiss",
diff --git a/sdk/python/generative-ai/rag/code_first/mlindex_local/local_docs_to_faiss_mlindex_with_promptflow.py b/sdk/python/generative-ai/rag/code_first/mlindex_local/local_docs_to_faiss_mlindex_with_promptflow.py
index a4c01631eb..1b0355d5ab 100644
--- a/sdk/python/generative-ai/rag/code_first/mlindex_local/local_docs_to_faiss_mlindex_with_promptflow.py
+++ b/sdk/python/generative-ai/rag/code_first/mlindex_local/local_docs_to_faiss_mlindex_with_promptflow.py
@@ -1,6 +1,7 @@
+# %%[markdown]
+# # Build a Faiss Index using MLIndex SDK and use it in Promptflow
+
# %% Pre-requisites
-# %pip install 'azure-ai-ml==1.10.0a20230825006' --extra-index-url https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/
-# %pip install 'azureml-rag[document_parsing,faiss]>=0.2.0'
# %pip install -U 'promptflow[azure]' promptflow-tools promptflow-vectordb
# %% Get Azure Cognitive Search Connection
diff --git a/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk/assets/custom_doc_intel_connection.png b/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk/assets/custom_doc_intel_connection.png
new file mode 100644
index 0000000000..330a5b2f54
Binary files /dev/null and b/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk/assets/custom_doc_intel_connection.png differ
diff --git a/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk/crack_and_chunk_with_doc_intel/crack_and_chunk.py b/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk/crack_and_chunk_with_doc_intel/crack_and_chunk.py
new file mode 100644
index 0000000000..f89716f602
--- /dev/null
+++ b/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk/crack_and_chunk_with_doc_intel/crack_and_chunk.py
@@ -0,0 +1,86 @@
+import os
+import traceback
+
+from azureml.rag.documents.cracking import file_extension_loaders
+from azureml.rag.tasks.crack_and_chunk import (
+ __main__,
+ crack_and_chunk_arg_parser,
+ str2bool,
+)
+from azureml.rag.tasks.crack_and_chunk import main as main_crack_and_chunk
+from azureml.rag.utils.connections import get_connection_by_id_v2
+from azureml.rag.utils.logging import (
+ get_logger,
+ safe_mlflow_start_run,
+ track_activity,
+)
+
+from azure.ai.formrecognizer import DocumentAnalysisClient
+from azure.core.credentials import AzureKeyCredential
+from document_intelligence_loader import DocumentIntelligencePDFLoader
+
+logger = get_logger("crack_and_chunk_document_intelligence")
+
+
+def main(args, logger, activity_logger):
+    """Configure Document Intelligence PDF cracking, then run crack_and_chunk.
+
+    The connection named by ``args.doc_intel_connection_id`` supplies the
+    endpoint and API key. Both are exported as environment variables and used
+    to build the ``DocumentAnalysisClient`` shared by
+    ``DocumentIntelligencePDFLoader``. The ``.pdf`` loader is then overridden
+    and the stock crack_and_chunk ``main`` is invoked.
+
+    Raises:
+        ValueError: if ``args.doc_intel_connection_id`` is empty or missing.
+    """
+    if not args.doc_intel_connection_id:
+        raise ValueError("doc_intel_connection_id is required")
+
+    connection = get_connection_by_id_v2(args.doc_intel_connection_id)
+
+    # Export the settings so code that reads the environment sees them too.
+    endpoint = connection["properties"]["metadata"]["endpoint"]
+    os.environ["DOCUMENT_INTELLIGENCE_ENDPOINT"] = endpoint
+    api_key = connection["properties"]["credentials"]["keys"]["api_key"]
+    os.environ["DOCUMENT_INTELLIGENCE_KEY"] = api_key
+    os.environ["AZURE_AI_DOCUMENT_INTELLIGENCE_USE_LAYOUT"] = str(args.use_layout)
+
+    # Configure the loader class directly for this process.
+    DocumentIntelligencePDFLoader.document_intelligence_client = (
+        DocumentAnalysisClient(
+            endpoint=endpoint,
+            credential=AzureKeyCredential(api_key),
+        )
+    )
+    DocumentIntelligencePDFLoader.use_layout = args.use_layout
+
+    # Override default `.pdf` loader to use Azure AI Document Intelligence
+    file_extension_loaders[".pdf"] = DocumentIntelligencePDFLoader
+
+    main_crack_and_chunk(args, logger, activity_logger)
+
+
+def main_wrapper(args, logger):
+    """Run ``main`` inside activity tracking and an MLflow run.
+
+    Any exception is logged on the activity logger with its full traceback and
+    then re-raised unchanged.
+    """
+    activity_name = "crack_and_chunk_document_intelligence"
+    with track_activity(logger, activity_name) as activity_logger:
+        with safe_mlflow_start_run(logger=logger):
+            try:
+                main(args, logger, activity_logger)
+            except Exception:
+                failure_trace = traceback.format_exc()
+                activity_logger.error(
+                    f"crack_and_chunk_document_intelligence failed with exception: {failure_trace}"
+                )
+                raise
+
+
+if __name__ == "__main__":
+    # Start from the stock crack_and_chunk CLI and add the two options that
+    # are specific to Document Intelligence cracking.
+    parser = crack_and_chunk_arg_parser()
+    parser.add_argument(
+        "--doc_intel_connection_id",
+        help="Custom Connection to use for Document Intelligence",
+        type=str,
+    )
+    parser.add_argument(
+        "--use_layout",
+        help="Use layout for PDF cracking",
+        default=False,
+        type=str2bool,
+    )
+
+    # Hand the parser and entry point to the shared crack_and_chunk runner.
+    __main__(parser, main_wrapper)
diff --git a/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk/crack_and_chunk_with_doc_intel/document_intelligence_loader.py b/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk/crack_and_chunk_with_doc_intel/document_intelligence_loader.py
new file mode 100644
index 0000000000..9107446f2a
--- /dev/null
+++ b/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk/crack_and_chunk_with_doc_intel/document_intelligence_loader.py
@@ -0,0 +1,244 @@
+import os
+import re
+from pathlib import Path
+from typing import IO, List
+
+from azure.ai.formrecognizer import DocumentAnalysisClient
+from azure.core.credentials import AzureKeyCredential
+from azureml.rag.documents import (
+ ChunkedDocument,
+ Document,
+ DocumentSource,
+ StaticDocument,
+)
+from azureml.rag.documents.cracking import BaseDocumentLoader
+from azureml.rag.utils.logging import get_logger
+
+logger = get_logger("document_intelligence_loader")
+
+
+class SingletonDocumentIntelligenceClient:
+    """Process-wide holder for a single ``DocumentAnalysisClient``.
+
+    Instantiating this class does not return an instance of the class itself:
+    ``__new__`` returns the cached ``DocumentAnalysisClient``, or a plain
+    placeholder ``object()`` when no endpoint/key are configured.
+    """
+
+    # Cached client (or placeholder) shared by every instantiation.
+    instance = None
+    # Captured at import time; refreshed in __new__ if still unset, because
+    # callers may export these variables only after this module is imported.
+    url = os.getenv("DOCUMENT_INTELLIGENCE_ENDPOINT")
+    key = os.getenv("DOCUMENT_INTELLIGENCE_KEY")
+
+    def __new__(cls, *args, **kwargs):
+        """Return the shared client, creating it on first use."""
+        if not cls.instance:
+            # Pick up credentials that were exported after import.
+            cls.url = cls.url or os.getenv("DOCUMENT_INTELLIGENCE_ENDPOINT")
+            cls.key = cls.key or os.getenv("DOCUMENT_INTELLIGENCE_KEY")
+            logger.info(
+                "SingletonFormRecognizerClient: Creating instance of Form recognizer per process"
+            )
+            if cls.url and cls.key:
+                cls.instance = DocumentAnalysisClient(
+                    endpoint=cls.url, credential=AzureKeyCredential(cls.key)
+                )
+            else:
+                logger.info(
+                    "SingletonFormRecognizerClient: Skipping since credentials not provided. Assuming NO form recognizer extensions(like .pdf) in directory"
+                )
+                cls.instance = object()  # dummy object
+        return cls.instance
+
+    def __getstate__(self):
+        """Pickle only the endpoint and key, not the client."""
+        return self.url, self.key
+
+    def __setstate__(self, state):
+        """Rebuild a client from the pickled endpoint and key."""
+        url, key = state
+        self.instance = DocumentAnalysisClient(
+            endpoint=url, credential=AzureKeyCredential(key)
+        )
+
+
+class DocumentIntelligencePDFLoader(BaseDocumentLoader):
+    """Load PDF files by cracking them with Azure AI Document Intelligence."""
+
+    # Client shared by all loader instances; created lazily in __init__ when
+    # nothing has assigned it beforehand.
+    document_intelligence_client = None
+    # None means "not configured"; __init__ then falls back to the
+    # AZURE_AI_DOCUMENT_INTELLIGENCE_USE_LAYOUT environment variable.
+    use_layout = None
+
+    def __init__(
+        self, file: IO, document_source: DocumentSource, metadata: dict = None
+    ):
+        """Initialize a PDF loader, switching text-mode files to their byte buffer."""
+        binary_file = file.buffer if file.mode == "r" else file
+        super().__init__(binary_file, document_source, metadata=metadata)
+
+        loader_cls = DocumentIntelligencePDFLoader
+        if loader_cls.document_intelligence_client is None:
+            loader_cls.document_intelligence_client = (
+                SingletonDocumentIntelligenceClient()
+            )
+
+        if self.use_layout is None:
+            env_flag = os.environ.get(
+                "AZURE_AI_DOCUMENT_INTELLIGENCE_USE_LAYOUT", "false"
+            )
+            self.use_layout = env_flag.lower() == "true"
+        logger.info(f"{self.use_layout = }")
+
+    def load_chunked_document(self) -> ChunkedDocument:
+        """Load file contents into a ChunkedDocument, one chunk per page."""
+        pages = self.load()
+        chunk_prefix = f"Title: {Path(self.document_source.filename).name}"
+
+        if self.use_layout:
+            # With layout enabled the page text carries HTML markup, so the
+            # source path suffix is switched to .html to select the html chunker.
+            source = DocumentSource(
+                path=self.document_source.path.with_suffix(".html"),
+                filename=self.document_source.filename,
+                url=self.document_source.url,
+                mtime=self.document_source.mtime,
+            )
+        else:
+            source = self.document_source
+
+        return ChunkedDocument(
+            chunks=pages,
+            source=source,
+            metadata={**self.metadata, "chunk_prefix": chunk_prefix},
+        )
+
+    def load(self) -> List[Document]:
+        """Load file contents into one StaticDocument per page."""
+        import copy
+
+        from azureml.rag.utils import merge_dicts
+
+        page_map = extract_pdf_content(
+            self.file, self.document_intelligence_client, use_layout=self.use_layout
+        )
+        base_metadata = copy.deepcopy(self.metadata)
+
+        documents = []
+        for page_num, _, page_text in page_map:
+            # Whitespace cleanup is only applied to layout (HTML-ish) output.
+            text = cleanup_content(page_text) if self.use_layout else page_text
+            page_metadata = merge_dicts(
+                base_metadata, {"source": {"page_number": page_num}}
+            )
+            documents.append(StaticDocument(text, metadata=page_metadata))
+        return documents
+
+    @classmethod
+    def file_io_mode(cls) -> str:
+        """Return the file io mode (binary read)."""
+        return "rb"
+
+    def file_extensions(self) -> List[str]:
+        """Return the file extensions of the file types to be loaded."""
+        return [".pdf"]
+
+
+PDF_HEADERS = {"title": "h1", "sectionHeading": "h2"}
+
+
+def table_to_html(table):
+    """Render an analyzed table as an HTML ``<table>`` string.
+
+    Cells are grouped by ``row_index`` for each index in
+    ``range(table.row_count)`` and ordered by ``column_index`` within a row.
+    Header cells (kind ``columnHeader`` or ``rowHeader``) become ``<th>``,
+    all others ``<td>``. Spans greater than 1 are emitted as ``colSpan`` /
+    ``rowSpan`` attributes, and cell content is HTML-escaped.
+
+    Args:
+        table: Object exposing ``cells`` and ``row_count``; each cell exposes
+            ``row_index``, ``column_index``, ``kind``, ``column_span``,
+            ``row_span`` and ``content``.
+
+    Returns:
+        The table markup as a single string.
+    """
+    import html
+
+    rows = [
+        sorted(
+            [cell for cell in table.cells if cell.row_index == i],
+            key=lambda cell: cell.column_index,
+        )
+        for i in range(table.row_count)
+    ]
+
+    parts = ["<table>"]
+    for row_cells in rows:
+        parts.append("<tr>")
+        for cell in row_cells:
+            tag = "th" if cell.kind in ("columnHeader", "rowHeader") else "td"
+            cell_spans = ""
+            if cell.column_span > 1:
+                cell_spans += f" colSpan={cell.column_span}"
+            if cell.row_span > 1:
+                cell_spans += f" rowSpan={cell.row_span}"
+            # The closing tag must carry the "</" prefix so each cell is closed.
+            parts.append(
+                f"<{tag}{cell_spans}>{html.escape(cell.content)}</{tag}>"
+            )
+        parts.append("</tr>")
+    parts.append("</table>")
+    return "".join(parts)
+
+
+def extract_pdf_content(file, form_recognizer_client, use_layout=False):
+    """Analyze a PDF with Document Intelligence and rebuild its text per page.
+
+    The document is analyzed with the ``prebuilt-layout`` model when
+    ``use_layout`` is true and with ``prebuilt-read`` otherwise. Characters
+    covered by a table span are replaced by one HTML rendering of that table,
+    and paragraphs whose role appears in ``PDF_HEADERS`` are wrapped in the
+    matching HTML heading tags.
+
+    Args:
+        file: Document passed as ``document=`` to ``begin_analyze_document``.
+        form_recognizer_client: Client exposing ``begin_analyze_document``.
+        use_layout: Selects the analysis model, as described above.
+
+    Returns:
+        A list of ``(page_num, offset, page_text)`` tuples, one per page.
+        ``page_num`` is the zero-based page index and ``offset`` is the summed
+        length of the page texts emitted before it.
+    """
+    offset = 0
+    page_map = []
+    model = "prebuilt-layout" if use_layout else "prebuilt-read"
+    poller = form_recognizer_client.begin_analyze_document(model, document=file)
+    form_recognizer_results = poller.result()
+
+    # (if using layout) mark all the positions of headers; only the first span
+    # of each paragraph is considered.
+    roles_start = {}
+    roles_end = {}
+    for paragraph in form_recognizer_results.paragraphs:
+        if paragraph.role is not None:
+            para_start = paragraph.spans[0].offset
+            para_end = paragraph.spans[0].offset + paragraph.spans[0].length
+            roles_start[para_start] = paragraph.role
+            roles_end[para_end] = paragraph.role
+
+    for page_num, page in enumerate(form_recognizer_results.pages):
+        # Tables are matched to a page by their first bounding region
+        # (page_number is 1-based, page_num is 0-based).
+        tables_on_page = [
+            table
+            for table in form_recognizer_results.tables
+            if table.bounding_regions[0].page_number == page_num + 1
+        ]
+
+        # (if using layout) mark all positions of the table spans in the page:
+        # table_chars[i] is the table index covering character i, or -1.
+        page_offset = page.spans[0].offset
+        page_length = page.spans[0].length
+        table_chars = [-1] * page_length
+        for table_id, table in enumerate(tables_on_page):
+            for span in table.spans:
+                for i in range(span.length):
+                    idx = span.offset - page_offset + i
+                    if 0 <= idx < page_length:
+                        table_chars[idx] = table_id
+
+        # Build the page text: characters inside table spans are replaced by
+        # the table html (once per table), and header boundaries get html
+        # heading tags.
+        page_parts = []
+        added_tables = set()
+        for idx, table_id in enumerate(table_chars):
+            if table_id == -1:
+                position = page_offset + idx
+                if position in roles_start:
+                    role = roles_start[position]
+                    if role in PDF_HEADERS:
+                        page_parts.append(f"<{PDF_HEADERS[role]}>")
+                if position in roles_end:
+                    role = roles_end[position]
+                    if role in PDF_HEADERS:
+                        # Closing tag: the "</" prefix is required to end the header.
+                        page_parts.append(f"</{PDF_HEADERS[role]}>")
+
+                page_parts.append(form_recognizer_results.content[position])
+
+            elif table_id not in added_tables:
+                page_parts.append(table_to_html(tables_on_page[table_id]))
+                added_tables.add(table_id)
+
+        page_parts.append(" ")
+        page_text = "".join(page_parts)
+        page_map.append((page_num, offset, page_text))
+        offset += len(page_text)
+
+    return page_map
+
+
+def cleanup_content(content: str) -> str:
+    """
+    Collapse redundant whitespace and dash runs in the given content.
+
+    Runs of two or more newlines become one newline, runs of two or more
+    non-newline whitespace characters become one space, and runs of two or
+    more dashes become ``--``. The result is stripped at both ends.
+
+    Args:
+    ----
+        content (str): The content to clean up.
+
+    Returns
+    -------
+        str: The cleaned up content.
+    """
+    # Applied in order; each pass works on the previous pass's output.
+    substitutions = (
+        (r"\n{2,}", "\n"),
+        (r"[^\S\n]{2,}", " "),
+        (r"-{2,}", "--"),
+    )
+    cleaned = content
+    for pattern, replacement in substitutions:
+        cleaned = re.sub(pattern, replacement, cleaned)
+
+    return cleaned.strip()
diff --git a/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk/crack_pdfs_with_azure_document_intelligence.ipynb b/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk/crack_pdfs_with_azure_document_intelligence.ipynb
new file mode 100644
index 0000000000..c58bc5a159
--- /dev/null
+++ b/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk/crack_pdfs_with_azure_document_intelligence.ipynb
@@ -0,0 +1,435 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Create an Index with custom cracking and chunking using Azure Document Intelligence\n",
+ "\n",
+ "Create an index with custom cracking and chunking using Azure AI Document Intelligence (formerly known as Azure Form Recognizer)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%pip install -U 'azure-ai-ml>=1.10'\n",
+ "%pip install -U 'azureml-rag[azure,cognitive_search]>=0.2.2'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%writefile config.json\n",
+ "{\n",
+ " \"subscription_id\": \"\",\n",
+ " \"resource_group\": \"\",\n",
+ " \"workspace_name\": \"\"\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential\n",
+ "from azure.ai.ml import MLClient\n",
+ "\n",
+ "try:\n",
+ " credential = DefaultAzureCredential()\n",
+ " # Check if given credential can get token successfully.\n",
+ " credential.get_token(\"https://management.azure.com/.default\")\n",
+ "except Exception as ex:\n",
+ " # Fall back to InteractiveBrowserCredential in case DefaultAzureCredential does not work\n",
+ " credential = InteractiveBrowserCredential()\n",
+ "\n",
+ "ml_client = MLClient.from_config(credential=credential)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Define the `crack_and_chunk_with_doc_intel` Component which can be used in place of the `crack_and_chunk` Component in Vector Index creation Pipelines."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from pathlib import Path\n",
+ "from azure.ai.ml import Input, Output, command\n",
+ "from azure.ai.ml.entities import BuildContext, Environment\n",
+ "\n",
+ "llm_rag_embeddings_doc_intel_environment = Environment(\n",
+ " name=\"llm_rag_embeddings_doc_intel\",\n",
+ " description=\"AzureML RAGs base crack_and_chunk environment with azure-ai-formrecognizer installed.\",\n",
+ " build=BuildContext(path=Path.cwd() / \"doc_intel_env\"),\n",
+ ")\n",
+ "\n",
+ "crack_and_chunk_with_doc_intel_component = command(\n",
+ " version=\"0.0.1\",\n",
+ " name=\"crack_and_chunk_with_doc_intel\",\n",
+ " display_name=\"Crack and Chunk Data leveraging Azure AI Document Intelligence for PDFs\",\n",
+ " description=\"\"\"Creates chunks from source data leveraging Azure AI Document Intelligence for PDFs.\n",
+ "\n",
+ " Supported formats: md, txt, html/htm, pdf, ppt(x), doc(x), xls(x), py\"\"\",\n",
+ " inputs={\n",
+ " # Input AzureML Data\n",
+ " \"input_data\": Input(type=\"uri_folder\", mode=\"rw_mount\"),\n",
+ " # Files to handle from source\n",
+ " \"input_glob\": Input(\n",
+ " type=\"string\",\n",
+ " default=\"**/*\",\n",
+ " description=\"Limit files opened from `input_data`, defaults to '**/*'\",\n",
+ " ),\n",
+ " \"allowed_extensions\": Input(\n",
+ " type=\"string\",\n",
+ " optional=True,\n",
+ " description=\"Comma separated list of extensions to include, if not provided the default list of supported extensions will be used. e.g. '.md,.txt,.html,.py,.pdf'\",\n",
+ " ),\n",
+ " # Chunking options\n",
+ " \"chunk_size\": Input(\n",
+ " type=\"integer\",\n",
+ " default=768,\n",
+ " description=\"Maximum number of tokens per chunk.\",\n",
+ " ),\n",
+ " \"chunk_overlap\": Input(\n",
+ " type=\"integer\",\n",
+ " default=0,\n",
+ " description=\"Number of tokens to overlap between chunks.\",\n",
+ " ),\n",
+ " \"use_rcts\": Input(\n",
+ " type=\"boolean\",\n",
+ " default=True,\n",
+ " description=\"Use langchain RecursiveTextSplitter to split chunks.\",\n",
+ " ),\n",
+ " # Augmentation options\n",
+ " \"data_source_url\": Input(\n",
+ " type=\"string\",\n",
+ " optional=True,\n",
+ " description=\"Base URL to join with file paths to create full source file URL for chunk metadata.\",\n",
+ " ),\n",
+ " \"document_path_replacement_regex\": Input(\n",
+ " type=\"string\",\n",
+ " optional=True,\n",
+ " description=\"A JSON string with two fields, 'match_pattern' and 'replacement_pattern' to be used with re.sub on the source url. e.g. '{\\\"match_pattern\\\": \\\"(.*)/articles/(.*)\\\", \\\"replacement_pattern\\\": \\\"\\\\1/\\\\2\\\"}' would remove '/articles' from the middle of the url.\",\n",
+ " ),\n",
+ " \"doc_intel_connection_id\": Input(\n",
+ " type=\"string\",\n",
+ " description=\"AzureML Connection ID for Custom Workspace Connection containing the `endpoint` key and `api_key` secret for an Azure AI Document Intelligence Service.\",\n",
+ " ),\n",
+ " \"use_layout\": Input(\n",
+ " type=\"boolean\",\n",
+ " default=False,\n",
+ " description=\"Use 'prebuilt-layout' model from Azure AI Document Intelligence, more expensive and slower but maintains more structure from original doc.\",\n",
+ " ),\n",
+ " },\n",
+ " outputs={\n",
+ " \"output_chunks\": Output(type=\"uri_folder\"),\n",
+ " },\n",
+ " code=Path.cwd() / \"crack_and_chunk_with_doc_intel\",\n",
+ " command=\"\"\"python crack_and_chunk.py\\\n",
+ " --input_data ${{inputs.input_data}}\\\n",
+ " --input_glob '${{inputs.input_glob}}'\\\n",
+ " $[[--allowed_extensions ${{inputs.allowed_extensions}}]]\\\n",
+ " --output_chunks ${{outputs.output_chunks}}\\\n",
+ " --chunk_size ${{inputs.chunk_size}}\\\n",
+ " --chunk_overlap ${{inputs.chunk_overlap}}\\\n",
+ " --use_rcts ${{inputs.use_rcts}}\\\n",
+ " $[[--data_source_url ${{inputs.data_source_url}}]]\\\n",
+ " $[[--document_path_replacement_regex '${{inputs.document_path_replacement_regex}}']]\\\n",
+ " --doc_intel_connection_id '${{inputs.doc_intel_connection_id}}'\\\n",
+ " --use_layout ${{inputs.use_layout}}\\\n",
+ " \"\"\",\n",
+ " environment=llm_rag_embeddings_doc_intel_environment,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Define pipeline using the custom `crack_and_chunk_with_doc_intel` Component along with the AzureML provided Components to embed and index your data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ml_registry = MLClient(credential=ml_client._credential, registry_name=\"azureml\")\n",
+ "\n",
+ "# Reads input folder of files containing chunks and their metadata as batches, in parallel, and generates embeddings for each chunk. Output format is produced and loaded by `azureml.rag.embeddings.EmbeddingContainer`.\n",
+ "generate_embeddings_component = ml_registry.components.get(\n",
+ " \"llm_rag_generate_embeddings\", label=\"latest\"\n",
+ ")\n",
+ "# Reads an input folder produced by `azureml.rag.embeddings.EmbeddingsContainer.save()` and pushes all documents (chunk, metadata, embedding_vector) into an Azure Cognitive Search index. Writes an MLIndex yaml detailing the index and embeddings model information.\n",
+ "update_acs_index_component = ml_registry.components.get(\n",
+ " \"llm_rag_update_acs_index\", label=\"latest\"\n",
+ ")\n",
+ "# Takes a uri to a storage location where an MLIndex yaml is stored and registers it as an MLIndex Data asset in the AzureML Workspace.\n",
+ "register_mlindex_asset_component = ml_registry.components.get(\n",
+ " \"llm_rag_register_mlindex_asset\", label=\"latest\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from azure.ai.ml import Input, Output\n",
+ "from azure.ai.ml.dsl import pipeline\n",
+ "from azure.ai.ml.entities._job.pipeline._io import PipelineInput\n",
+ "\n",
+ "\n",
+ "def use_automatic_compute(\n",
+ " component, instance_count=1, instance_type=\"Standard_D4as_v4\"\n",
+ "):\n",
+ " component.set_resources(\n",
+ " instance_count=instance_count,\n",
+ " instance_type=instance_type,\n",
+ " properties={\"compute_specification\": {\"automatic\": True}},\n",
+ " )\n",
+ " return component\n",
+ "\n",
+ "\n",
+ "def optional_pipeline_input_provided(input: PipelineInput):\n",
+ " return input._data is not None\n",
+ "\n",
+ "\n",
+ "@pipeline(default_compute=\"serverless\")\n",
+ "def uri_to_acs(\n",
+ " input_data: Input,\n",
+ " doc_intel_connection_id: str,\n",
+ " embeddings_model: str,\n",
+ " acs_config: str,\n",
+ " acs_connection_id: str,\n",
+ " asset_name: str,\n",
+ " chunk_size: int = 512,\n",
+ " data_source_glob: str = None,\n",
+ " data_source_url: str = None,\n",
+ " document_path_replacement_regex: str = None,\n",
+ " use_layout: bool = False,\n",
+ " aoai_connection_id: str = None,\n",
+ " embeddings_container: Input = None,\n",
+ "):\n",
+ " crack_and_chunk = crack_and_chunk_with_doc_intel_component(\n",
+ " input_data=input_data,\n",
+ " input_glob=data_source_glob,\n",
+ " chunk_size=chunk_size,\n",
+ " use_rcts=True,\n",
+ " data_source_url=data_source_url,\n",
+ " document_path_replacement_regex=document_path_replacement_regex,\n",
+ " doc_intel_connection_id=doc_intel_connection_id,\n",
+ " use_layout=use_layout,\n",
+ " )\n",
+ " use_automatic_compute(crack_and_chunk)\n",
+ "\n",
+ " generate_embeddings = generate_embeddings_component(\n",
+ " chunks_source=crack_and_chunk.outputs.output_chunks,\n",
+ " embeddings_container=embeddings_container,\n",
+ " embeddings_model=embeddings_model,\n",
+ " )\n",
+ "\n",
+ " use_automatic_compute(generate_embeddings)\n",
+ " if optional_pipeline_input_provided(aoai_connection_id):\n",
+ " generate_embeddings.environment_variables[\n",
+ " \"AZUREML_WORKSPACE_CONNECTION_ID_AOAI\"\n",
+ " ] = aoai_connection_id\n",
+ " if optional_pipeline_input_provided(embeddings_container):\n",
+ " # If provided, `embeddings_container` is expected to be a URI to an 'embeddings container' folder.\n",
+ " # Each folder under this folder is generated by a `generate_embeddings_component` run and can be reused for subsequent embeddings runs.\n",
+ " generate_embeddings.outputs.embeddings = Output(\n",
+ " type=\"uri_folder\", path=f\"{embeddings_container.path}/{{name}}\"\n",
+ " )\n",
+ "\n",
+ " update_acs_index = update_acs_index_component(\n",
+ " embeddings=generate_embeddings.outputs.embeddings,\n",
+ " acs_config=acs_config,\n",
+ " )\n",
+ " use_automatic_compute(update_acs_index)\n",
+ " if optional_pipeline_input_provided(acs_connection_id):\n",
+ " update_acs_index.environment_variables[\n",
+ " \"AZUREML_WORKSPACE_CONNECTION_ID_ACS\"\n",
+ " ] = acs_connection_id\n",
+ "\n",
+ " register_mlindex = register_mlindex_asset_component(\n",
+ " storage_uri=update_acs_index.outputs.index,\n",
+ " asset_name=asset_name,\n",
+ " )\n",
+ " use_automatic_compute(register_mlindex)\n",
+ " return {\n",
+ " \"mlindex_asset_uri\": update_acs_index.outputs.index,\n",
+ " \"mlindex_asset_id\": register_mlindex.outputs.asset_id,\n",
+ " }"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Get the connections to Azure OpenAI (for embeddings with `text-embedding-ada-002`) and Azure Cognitive Search."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "aoai_connection = ml_client.connections.get(\"azureml-rag-oai\")\n",
+ "acs_connection = ml_client.connections.get(\"azureml-rag-acs\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Create a Custom Connection with details for an Azure AI Document Intelligence Service.\n",
+ "[Setup instructions for Azure AI Document Intelligence](https://learn.microsoft.com/azure/ai-services/document-intelligence/create-document-intelligence-resource?view=doc-intel-3.1.0)\n",
+ "\n",
+ "Use the Connections UI in an AzureML Workspace, under the Promptflow tab, to create a connection with these fields: \n",
+ "\n",
+ "Creating or retrieving Custom Connections through the SDK is not yet supported, so create the connection in the UI and name it `azureml-rag-documentintelligence`. The next cell derives its ID by string replacement on the ID of the `azureml-rag-oai` connection, and that ID is then passed to the pipeline."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "document_intelligence_connection_id = aoai_connection.id.replace(\n",
+ " \"azureml-rag-oai\", \"azureml-rag-documentintelligence\"\n",
+ ")\n",
+ "document_intelligence_connection_id"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "from azure.ai.ml import Input\n",
+ "\n",
+ "embeddings_model = (\n",
+ " \"azure_open_ai://deployment/text-embedding-ada-002/model/text-embedding-ada-002\"\n",
+ ")\n",
+ "\n",
+ "data_source = Path.cwd() / \"pdfs\"\n",
+ "asset_name = f\"doc_intel_{embeddings_model.split(':')[0]}_acs\"\n",
+ "\n",
+ "pipeline_job = uri_to_acs(\n",
+ " input_data=Input(type=\"uri_folder\", path=str(data_source)),\n",
+ " data_source_glob=\"**/*\",\n",
+ " data_source_url=None,\n",
+ " document_path_replacement_regex=None,\n",
+ " doc_intel_connection_id=document_intelligence_connection_id,\n",
+ " use_layout=False,\n",
+ " embeddings_model=embeddings_model,\n",
+ " aoai_connection_id=aoai_connection.id,\n",
+ " embeddings_container=Input(\n",
+ " type=\"uri_folder\",\n",
+ " path=f\"azureml://datastores/workspaceblobstore/paths/embeddings/{asset_name}\",\n",
+ " ),\n",
+ " acs_config=json.dumps(\n",
+ " {\n",
+ " \"index_name\": asset_name,\n",
+ " }\n",
+ " ),\n",
+ " acs_connection_id=acs_connection.id,\n",
+ " asset_name=asset_name,\n",
+ ")\n",
+ "pipeline_job.display_name = asset_name\n",
+ "\n",
+ "# Properties for Vector Index UI\n",
+ "pipeline_job.properties[\"azureml.mlIndexAssetName\"] = asset_name\n",
+ "pipeline_job.properties[\"azureml.mlIndexAssetKind\"] = \"acs\"\n",
+ "pipeline_job.properties[\"azureml.mlIndexAssetSource\"] = \"AzureML Data\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(f\"Submitting pipeline job to experiment: {asset_name}\")\n",
+ "running_pipeline_job = ml_client.jobs.create_or_update(\n",
+ " pipeline_job, experiment_name=asset_name\n",
+ ")\n",
+ "\n",
+ "print(f\"Submitted run, url: {running_pipeline_job.studio_url}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ml_client.jobs.stream(running_pipeline_job.name)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from azureml.rag.mlindex import MLIndex\n",
+ "\n",
+ "question = \"What is RAG?\"\n",
+ "\n",
+ "retriever = MLIndex(\n",
+ " ml_client.data.get(asset_name, label=\"latest\")\n",
+ ").as_langchain_retriever()\n",
+ "retriever.get_relevant_documents(question)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "project-baker310",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.12"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk/doc_intel_env/Dockerfile b/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk/doc_intel_env/Dockerfile
new file mode 100644
index 0000000000..ab6f08fe22
--- /dev/null
+++ b/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk/doc_intel_env/Dockerfile
@@ -0,0 +1,4 @@
+FROM mcr.microsoft.com/azureml/curated/llm-rag-embeddings:latest
+# Quote the requirement: unquoted, the shell reads ">=0.2.2" as a redirect of stdout to a file named "=0.2.2" and pip installs azureml-rag with no version floor.
+RUN pip install -U 'azureml-rag>=0.2.2'
+RUN pip install azure-ai-formrecognizer
diff --git a/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk/pdfs/retrieval-augmented-generation-for-knowledge-Intensive-nlp-tasks_arXiv-2005.11401v4.pdf b/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk/pdfs/retrieval-augmented-generation-for-knowledge-Intensive-nlp-tasks_arXiv-2005.11401v4.pdf
new file mode 100644
index 0000000000..39218379c6
Binary files /dev/null and b/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk/pdfs/retrieval-augmented-generation-for-knowledge-Intensive-nlp-tasks_arXiv-2005.11401v4.pdf differ