Skip to content
This repository has been archived by the owner on Mar 1, 2024. It is now read-only.

Integration of Azure DevOps Client Adapter with BaseGithubClient #761

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions llama_hub/github_repo/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,54 @@ for doc in docs:
print(doc.extra_info)
```

### Azure DevOps

```bash
export AZURE_DEVOPS_BASE_URL='...'
export AZURE_DEVOPS_USERNAME='...'
export AZURE_DEVOPS_PASSWORD='...'
```

```python
import os

from llama_index import download_loader
download_loader("GithubRepositoryReader")

from llama_hub.github_repo import GithubRepositoryReader, AzureDevOpsAdapter

# Example: https://dev.azure.com/ahmetkarapinar/testProject/_git/testProject/commit/08633d3844192a69ab5011c20201dba3aced0a41?refName=refs%2Fheads%2Fmaster
# 'ahmetkarapinar' is organization id
# 'testProject' is project id
# 'testProject' is repository id
# '08633d3844192a69ab5011c20201dba3aced0a41' commit sha
# 'master' branch name


azure_devops_adapter = AzureDevOpsAdapter(
base_url=os.environ["AZURE_DEVOPS_BASE_URL"], # Ex. 'https://dev.azure.com/YOURORG'
username=os.environ["AZURE_DEVOPS_USERNAME"],
password=os.environ["AZURE_DEVOPS_PASSWORD"],
)

loader = GithubRepositoryReader(
github_client = azure_devops_adapter,
owner = "<your_project_id_goes_here>",
repo = "<your_repository_id_goes_here>",
filter_directories = (["llama_index", "docs"], GithubRepositoryReader.FilterType.INCLUDE),
filter_file_extensions = ([".py"], GithubRepositoryReader.FilterType.INCLUDE),
verbose = True,
concurrent_requests = 10,
)

docs = loader.load_data(branch="main")
# alternatively, load from a specific commit:
# docs = loader.load_data(commit_sha="a6c89159bf8e7086bea2f4305cff3f0a4102e370")

for doc in docs:
print(doc.extra_info)
```

## Examples

This loader is designed to be used as a way to load data into [Llama Index](https://github.com/run-llama/llama_index/tree/main/llama_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent.
Expand Down
250 changes: 250 additions & 0 deletions llama_hub/github_repo/azure_devops.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,250 @@
"""
Azure DevOps Client Adapter for BaseGithubClient.

This class is used to interact with Azure DevOps repositories. It uses the azure-devops package.
The implementation is merely a workaround to use the same code for Github and Azure DevOps.
"""

from typing import Any, Dict, List, Optional
from llama_hub.github_repo.github_client import (
BaseGithubClient,
GitBlobResponseModel,
GitBranchResponseModel,
GitCommitResponseModel,
GitTreeResponseModel,
)

from azure.devops.v7_0.git.git_client import GitClient
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please move these imports inside the class to avoid build failures.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi anoop, how can these imports cause a build failure? I was able to run it. Is there a way to build it locally? If so, how can I do it?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi, if you check the other loaders, they also import their libraries lazily.

from azure.devops.v7_0.git.models import GitTreeRef
from azure.devops.v7_0.git.models import GitTreeEntryRef
from azure.devops.v7_0.git.models import GitBlobRef
from azure.devops.v7_0.git.models import GitCommit
from azure.devops.v7_0.git.models import GitBranchStats


class AzureDevOpsAdapter(BaseGithubClient):
    """
    Azure DevOps adapter.

    This class is used to interact with Azure DevOps repositories. It uses the azure-devops package.
    Each method is the same as the corresponding method in BaseGithubClient. All of the Azure DevOps
    specific response models are converted to the corresponding Github response models.

    Args:
        - `base_url (str)`: Azure DevOps base url. Example: 'https://dev.azure.com/YOURORG'
        - `username (str)`: Azure DevOps username. You can leave this blank if you are using a PAT. ex: ''
        - `password (str)`: Azure DevOps password. Personal Access Token (PAT) is recommended.

    Raises:
        - `ImportError`: If azure-devops package is not installed.
        - `ValueError`: If base_url, username or password is not provided.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """
        Validate the connection settings and create the Azure DevOps git client.

        The settings are read from the keyword arguments `base_url`,
        `username` and `password`; each of them must be present and not None.
        """
        super().__init__(*args, **kwargs)
        try:
            from azure.devops.connection import Connection
            from msrest.authentication import BasicAuthentication
        except ImportError as err:
            # Chain the original error so the real cause stays in the traceback.
            raise ImportError(
                "Please install azure-devops package to use Azure DevOps adapter"
            ) from err

        base_url = kwargs.get("base_url")
        username = kwargs.get("username")
        password = kwargs.get("password")

        if base_url is None:
            raise ValueError(
                "Azure DevOps base_url is required. Example: 'https://dev.azure.com/YOURORG'"
            )
        # An empty string is accepted here on purpose: only None is rejected.
        if username is None:
            raise ValueError(
                "Azure DevOps username is required. You can leave this blank if you are using a PAT. ex: ''"
            )
        if password is None:
            raise ValueError(
                "Azure DevOps password is required. Personal Access Token (PAT) is recommended."
            )

        self.connection = Connection(
            base_url=base_url,
            creds=BasicAuthentication(
                username=username,
                password=password,
            ),
        )
        self._git_client: GitClient = self.connection.clients.get_git_client()

    def get_all_endpoints(self) -> Dict[str, str]:
        """Not supported by this adapter; always raises `NotImplementedError`."""
        raise NotImplementedError

    async def request(
        self,
        endpoint: str,
        method: str,
        headers: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> Any:
        """Not supported by this adapter; always raises `NotImplementedError`."""
        # `headers` defaults to None instead of a shared mutable `{}`.
        raise NotImplementedError

    async def get_tree(
        self,
        owner: str,
        repo: str,
        tree_sha: str,
    ) -> GitTreeResponseModel:
        """
        Get the tree for a given sha.

        Args:
            - `owner (str)`: Project name or project id.
            - `repo (str)`: repository id.
            - `tree_sha (str)`: sha of the tree.

        Returns:
            - `tree (GitTreeResponseModel)`: Tree response model. `truncated`
              is always reported as False.
        """
        _git_tree_response: GitTreeRef = self._git_client.get_tree(
            repository_id=repo,
            sha1=tree_sha,
            project=owner,
        )

        # Map every Azure DevOps tree entry onto the Github tree object model.
        git_tree_object_list: List[GitTreeResponseModel.GitTreeObject] = [
            GitTreeResponseModel.GitTreeObject(
                path=tree_entry.relative_path,
                mode=tree_entry.mode,
                type=tree_entry.git_object_type,
                sha=tree_entry.object_id,
                url=tree_entry.url,
                size=tree_entry.size,
            )
            for tree_entry in _git_tree_response.tree_entries
        ]

        return GitTreeResponseModel(
            sha=_git_tree_response.object_id,
            url=_git_tree_response.url,
            tree=git_tree_object_list,
            truncated=False,
        )

    async def get_blob(
        self,
        owner: str,
        repo: str,
        file_sha: str,
    ) -> GitBlobResponseModel:
        """
        Get the blob for a given sha.

        Two calls are made: one for the blob metadata (object id and url) and
        one for the blob content, which is assembled from the chunks yielded
        by the client.

        Args:
            - `owner (str)`: Project name or project id.
            - `repo (str)`: repository id.
            - `file_sha (str)`: sha of the blob.

        Returns:
            - `blob (GitBlobResponseModel)`: Blob response model. `content`
              holds the concatenated chunks, `size` is their total length and
              `encoding` is reported as "utf-8".
        """
        _git_blob_response: GitBlobRef = self._git_client.get_blob(
            repository_id=repo,
            sha1=file_sha,
            project=owner,
            download=False,
            resolve_lfs=False,
        )

        _git_blob_content_iterator = self._git_client.get_blob_content(
            repository_id=repo,
            sha1=file_sha,
            project=owner,
            download=False,
            resolve_lfs=False,
        )

        # Join once instead of `bytes +=` per chunk, which copies the whole
        # buffer on every iteration.
        _git_blob_content: bytes = b"".join(_git_blob_content_iterator)

        return GitBlobResponseModel(
            content=_git_blob_content,
            size=len(_git_blob_content),
            encoding="utf-8",
            sha=_git_blob_response.object_id,
            url=_git_blob_response.url,
            node_id=None,
        )

    async def get_commit(
        self,
        owner: str,
        repo: str,
        commit_sha: str,
    ) -> GitCommitResponseModel:
        """
        Get the commit for a given sha.

        Args:
            - `owner (str)`: Project name or project id.
            - `repo (str)`: repository id.
            - `commit_sha (str)`: sha of the commit.

        Returns:
            - `commit (GitCommitResponseModel)`: Commit response model.
        """
        _git_commit_response: GitCommit = self._git_client.get_commit(
            repository_id=repo,
            commit_id=commit_sha,
            project=owner,
        )

        return GitCommitResponseModel(
            url=_git_commit_response.url,
            sha=_git_commit_response.commit_id,
            commit=GitCommitResponseModel.Commit(
                tree=GitCommitResponseModel.Commit.Tree(
                    sha=_git_commit_response.tree_id,
                ),
            ),
        )

    async def get_branch(
        self,
        owner: str,
        repo: str,
        branch: Optional[str] = None,
        branch_name: Optional[str] = None,
    ) -> GitBranchResponseModel:
        """
        Get the branch for a given branch name.

        Args:
            - `owner (str)`: Project name or project id.
            - `repo (str)`: repository id.
            - `branch (str)`: branch name.
            - `branch_name (str)`: alternative way to pass the branch name;
              only used when `branch` is None.

        Returns:
            - `branch (GitBranchResponseModel)`: Branch response model.

        Raises:
            - `ValueError`: If neither `branch` nor `branch_name` is provided.
        """
        if branch is None:
            branch = branch_name
        if branch is None:
            raise ValueError("Either branch or branch_name must be provided.")

        _git_branch_response: GitBranchStats = self._git_client.get_branch(
            repository_id=repo, project=owner, name=branch
        )

        # get the latest commit for the branch; its tree id is what the
        # response model needs
        _git_commit_response: GitCommit = self._git_client.get_commit(
            repository_id=repo,
            commit_id=_git_branch_response.commit.commit_id,
            project=owner,
        )

        return GitBranchResponseModel(
            name=_git_branch_response.name,
            commit=GitBranchResponseModel.Commit(
                commit=GitBranchResponseModel.Commit.Commit(
                    tree=GitBranchResponseModel.Commit.Commit.Tree(
                        sha=_git_commit_response.tree_id,
                    ),
                ),
            ),
            _links=None,
        )
21 changes: 14 additions & 7 deletions llama_hub/github_repo/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from llama_index.readers.base import BaseReader
from llama_index.readers.file.base import DEFAULT_FILE_READER_CLS
from llama_index.readers.schema.base import Document
from llama_hub.github_repo import github_client

from llama_hub.github_repo.github_client import (
BaseGithubClient,
Expand Down Expand Up @@ -247,7 +248,7 @@ def _load_data_from_branch(self, branch: str) -> List[Document]:
:return: list of documents
"""
branch_data: GitBranchResponseModel = self._loop.run_until_complete(
self._github_client.get_branch(self._owner, self._repo, branch)
self._github_client.get_branch(self._owner, self._repo, branch, branch)
)

tree_sha = branch_data.commit.commit.tree.sha
Expand Down Expand Up @@ -393,7 +394,7 @@ async def _generate_documents(
async for blob_data, full_path in buffered_iterator:
print_if_verbose(self._verbose, f"generating document for {full_path}")
assert (
blob_data.encoding == "base64"
blob_data.encoding == "base64" or blob_data.encoding == "utf-8"
), f"blob encoding {blob_data.encoding} not supported"
decoded_bytes = None
try:
Expand All @@ -403,7 +404,13 @@ async def _generate_documents(
print_if_verbose(
self._verbose, f"could not decode {full_path} as base64"
)
continue
# tried to decode the content that was base64 encoded but failed
# continue
if blob_data.encoding == "base64":
continue
# if the content was not base64 encoded and we failed to decode it
# as base64, then we assume it is raw text
decoded_bytes = blob_data.content

if self._use_parser:
document = self._parse_supported_file(
Expand Down Expand Up @@ -547,7 +554,7 @@ def wrapper(*args: Any, **kwargs: Any) -> None:
verbose=True,
filter_directories=(
["docs"],
GithubRepositoryReader.FilterType.INCLUDE,
GithubRepositoryReader.FilterType.EXCLUDE,
),
filter_file_extensions=(
[
Expand All @@ -557,7 +564,7 @@ def wrapper(*args: Any, **kwargs: Any) -> None:
".gif",
".svg",
".ico",
"json",
".json",
".ipynb",
],
GithubRepositoryReader.FilterType.EXCLUDE,
Expand All @@ -584,6 +591,6 @@ def load_data_from_branch() -> None:

load_data_from_branch()

# input("Press enter to load github repository from commit sha...")
input("Press enter to load github repository from commit sha...")

# load_data_from_commit()
load_data_from_commit()
3 changes: 2 additions & 1 deletion llama_hub/github_repo/requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
httpx
httpx
azure-devops