Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implemented connector validation for confluence #3632

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def confluence_group_sync(
credentials=cc_pair.credential.credential_json,
is_cloud=cc_pair.connector.connector_specific_config.get("is_cloud", False),
wiki_base=cc_pair.connector.connector_specific_config["wiki_base"],
test_query="type=page",
)

group_member_email_map = _build_group_member_email_map(
Expand Down
24 changes: 23 additions & 1 deletion backend/onyx/connectors/confluence/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,13 @@
from onyx.configs.constants import DocumentSource
from onyx.connectors.confluence.onyx_confluence import build_confluence_client
from onyx.connectors.confluence.onyx_confluence import OnyxConfluence
from onyx.connectors.confluence.onyx_confluence import validate_connector_configuration
from onyx.connectors.confluence.utils import attachment_to_content
from onyx.connectors.confluence.utils import build_confluence_document_id
from onyx.connectors.confluence.utils import datetime_from_string
from onyx.connectors.confluence.utils import extract_text_from_confluence_html
from onyx.connectors.confluence.utils import validate_attachment_filetype
from onyx.connectors.interfaces import ConnectorValidator
from onyx.connectors.interfaces import GenerateDocumentsOutput
from onyx.connectors.interfaces import GenerateSlimDocumentOutput
from onyx.connectors.interfaces import LoadConnector
Expand Down Expand Up @@ -76,7 +78,9 @@
)


class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
class ConfluenceConnector(
LoadConnector, PollConnector, SlimConnector, ConnectorValidator
):
def __init__(
self,
wiki_base: str,
Expand Down Expand Up @@ -144,6 +148,7 @@ def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None
credentials=credentials,
is_cloud=self.is_cloud,
wiki_base=self.wiki_base,
test_query=self.base_cql_page_query,
)
return None

Expand Down Expand Up @@ -378,3 +383,20 @@ def retrieve_all_slim_documents(
doc_metadata_list = doc_metadata_list[_SLIM_DOC_BATCH_SIZE:]

yield doc_metadata_list

def validate_connector_configuration(self) -> None:
"""
This will raise an exception if either the
credentials or the connector configuration are invalid.

This is determined by trying to connect to Confluence
and retrieving a list of spaces (with a limit of 1 so it
doesn't take too long).
"""
validate_connector_configuration(
confluence_client=self.confluence_client,
# Let it retry but it shouldn't be too long
max_backoff_retries=2,
max_backoff_seconds=2,
test_query=self.base_cql_page_query,
)
90 changes: 61 additions & 29 deletions backend/onyx/connectors/confluence/onyx_confluence.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,11 @@

from atlassian import Confluence # type:ignore
from requests import HTTPError
from requests.exceptions import MissingSchema

from onyx.connectors.interfaces import InvalidConnectorConfigurationException
from onyx.connectors.interfaces import InvalidConnectorException
from onyx.connectors.interfaces import InvalidCredentialsException
from onyx.utils.logger import setup_logger

logger = setup_logger()
Expand Down Expand Up @@ -337,23 +341,55 @@ def paginated_group_members_retrieval(
yield from self._paginate_url(f"rest/api/group/{group_name}/member", limit)


def _validate_connector_configuration(
credentials: dict[str, Any],
is_cloud: bool,
wiki_base: str,
def validate_connector_configuration(
confluence_client: OnyxConfluence,
max_backoff_retries: int,
max_backoff_seconds: int,
test_query: str,
) -> None:
# test connection with direct client, no retries
confluence_client_with_minimal_retries = Confluence(
api_version="cloud" if is_cloud else "latest",
url=wiki_base.rstrip("/"),
username=credentials["confluence_username"] if is_cloud else None,
password=credentials["confluence_access_token"] if is_cloud else None,
token=credentials["confluence_access_token"] if not is_cloud else None,
backoff_and_retry=True,
max_backoff_retries=6,
max_backoff_seconds=10,
)
spaces = confluence_client_with_minimal_retries.get_all_spaces(limit=1)
"""
This function will test the connection to Confluence by retrieving a list of spaces.
If no spaces are found, it will raise an exception.
"""
# We want to test the connection with the base client, not the wrapped one
# because the wrapped one has retries built in and we want this to be fast
try:
confluence_client_with_minimal_retries = Confluence(
api_version=confluence_client.api_version,
url=confluence_client.url,
username=confluence_client.username,
password=confluence_client.password,
token=confluence_client.token
if hasattr(confluence_client, "token")
else None,
backoff_and_retry=True,
max_backoff_retries=max_backoff_retries,
max_backoff_seconds=max_backoff_seconds,
)
response = confluence_client_with_minimal_retries.get(
f"rest/api/content/search?cql={test_query}&limit=1",
advanced_mode=True,
)
response.raise_for_status()
except Exception as e:
if isinstance(e, HTTPError):
if e.response.status_code == 403:
raise InvalidCredentialsException(str(e))
if e.response.status_code in [401, 404]:
raise InvalidConnectorConfigurationException(str(e))
elif isinstance(e, MissingSchema):
raise InvalidConnectorConfigurationException(str(e))

raise InvalidConnectorException(str(e))

# All confluence clients should be able to retrieve at least one space
# If not, there is an issue with the credentials or the connector configuration
pages = response.json().get("results", [])
if not pages:
raise InvalidConnectorConfigurationException(
f"No pages found for query {test_query}! "
"check your connector Space/Page/CQL Query Settings "
)

# uncomment the following for testing
# the following is an attempt to retrieve the user's timezone
Expand All @@ -362,25 +398,14 @@ def _validate_connector_configuration(
# space_key = spaces["results"][0]["key"]
# space_details = confluence_client_with_minimal_retries.cql(f"space.key={space_key}+AND+type=space")

if not spaces:
raise RuntimeError(
f"No spaces found at {wiki_base}! "
"Check your credentials and wiki_base and make sure "
"is_cloud is set correctly."
)


def build_confluence_client(
credentials: dict[str, Any],
is_cloud: bool,
wiki_base: str,
test_query: str,
) -> OnyxConfluence:
_validate_connector_configuration(
credentials=credentials,
is_cloud=is_cloud,
wiki_base=wiki_base,
)
return OnyxConfluence(
onyx_confluence_client = OnyxConfluence(
api_version="cloud" if is_cloud else "latest",
# Remove trailing slash from wiki_base if present
url=wiki_base.rstrip("/"),
Expand All @@ -393,3 +418,10 @@ def build_confluence_client(
max_backoff_seconds=60,
cloud=is_cloud,
)
validate_connector_configuration(
confluence_client=onyx_confluence_client,
max_backoff_retries=6,
max_backoff_seconds=10,
test_query=test_query,
)
return onyx_confluence_client
163 changes: 87 additions & 76 deletions backend/onyx/connectors/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,11 @@
from onyx.connectors.guru.connector import GuruConnector
from onyx.connectors.hubspot.connector import HubSpotConnector
from onyx.connectors.interfaces import BaseConnector
from onyx.connectors.interfaces import ConnectorValidator
from onyx.connectors.interfaces import EventConnector
from onyx.connectors.interfaces import LoadConnector
from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SlimConnector
from onyx.connectors.linear.connector import LinearConnector
from onyx.connectors.loopio.connector import LoopioConnector
from onyx.connectors.mediawiki.wiki import MediaWikiConnector
Expand All @@ -57,82 +59,84 @@ class ConnectorMissingException(Exception):
pass


def identify_connector_class(
source: DocumentSource,
input_type: InputType | None = None,
) -> Type[BaseConnector]:
connector_map = {
DocumentSource.WEB: WebConnector,
DocumentSource.FILE: LocalFileConnector,
DocumentSource.SLACK: {
InputType.POLL: SlackPollConnector,
InputType.SLIM_RETRIEVAL: SlackPollConnector,
},
DocumentSource.GITHUB: GithubConnector,
DocumentSource.GMAIL: GmailConnector,
DocumentSource.GITLAB: GitlabConnector,
DocumentSource.GOOGLE_DRIVE: GoogleDriveConnector,
DocumentSource.BOOKSTACK: BookstackConnector,
DocumentSource.CONFLUENCE: ConfluenceConnector,
DocumentSource.JIRA: JiraConnector,
DocumentSource.PRODUCTBOARD: ProductboardConnector,
DocumentSource.SLAB: SlabConnector,
DocumentSource.NOTION: NotionConnector,
DocumentSource.ZULIP: ZulipConnector,
DocumentSource.GURU: GuruConnector,
DocumentSource.LINEAR: LinearConnector,
DocumentSource.HUBSPOT: HubSpotConnector,
DocumentSource.DOCUMENT360: Document360Connector,
DocumentSource.GONG: GongConnector,
DocumentSource.GOOGLE_SITES: GoogleSitesConnector,
DocumentSource.ZENDESK: ZendeskConnector,
DocumentSource.LOOPIO: LoopioConnector,
DocumentSource.DROPBOX: DropboxConnector,
DocumentSource.SHAREPOINT: SharepointConnector,
DocumentSource.TEAMS: TeamsConnector,
DocumentSource.SALESFORCE: SalesforceConnector,
DocumentSource.DISCOURSE: DiscourseConnector,
DocumentSource.AXERO: AxeroConnector,
DocumentSource.CLICKUP: ClickupConnector,
DocumentSource.MEDIAWIKI: MediaWikiConnector,
DocumentSource.WIKIPEDIA: WikipediaConnector,
DocumentSource.ASANA: AsanaConnector,
DocumentSource.S3: BlobStorageConnector,
DocumentSource.R2: BlobStorageConnector,
DocumentSource.GOOGLE_CLOUD_STORAGE: BlobStorageConnector,
DocumentSource.OCI_STORAGE: BlobStorageConnector,
DocumentSource.XENFORO: XenforoConnector,
DocumentSource.DISCORD: DiscordConnector,
DocumentSource.FRESHDESK: FreshdeskConnector,
DocumentSource.FIREFLIES: FirefliesConnector,
DocumentSource.EGNYTE: EgnyteConnector,
DocumentSource.AIRTABLE: AirtableConnector,
}
connector_by_source = connector_map.get(source, {})

if isinstance(connector_by_source, dict):
if input_type is None:
# If not specified, default to most exhaustive update
connector = connector_by_source.get(InputType.LOAD_STATE)
else:
connector = connector_by_source.get(input_type)
else:
connector = connector_by_source
if connector is None:
raise ConnectorMissingException(f"Connector not found for source={source}")
_CONNECTOR_MAP: dict[DocumentSource, Type[BaseConnector]] = {
DocumentSource.WEB: WebConnector,
DocumentSource.FILE: LocalFileConnector,
DocumentSource.SLACK: SlackPollConnector,
DocumentSource.GITHUB: GithubConnector,
DocumentSource.GMAIL: GmailConnector,
DocumentSource.GITLAB: GitlabConnector,
DocumentSource.GOOGLE_DRIVE: GoogleDriveConnector,
DocumentSource.BOOKSTACK: BookstackConnector,
DocumentSource.CONFLUENCE: ConfluenceConnector,
DocumentSource.JIRA: JiraConnector,
DocumentSource.PRODUCTBOARD: ProductboardConnector,
DocumentSource.SLAB: SlabConnector,
DocumentSource.NOTION: NotionConnector,
DocumentSource.ZULIP: ZulipConnector,
DocumentSource.GURU: GuruConnector,
DocumentSource.LINEAR: LinearConnector,
DocumentSource.HUBSPOT: HubSpotConnector,
DocumentSource.DOCUMENT360: Document360Connector,
DocumentSource.GONG: GongConnector,
DocumentSource.GOOGLE_SITES: GoogleSitesConnector,
DocumentSource.ZENDESK: ZendeskConnector,
DocumentSource.LOOPIO: LoopioConnector,
DocumentSource.DROPBOX: DropboxConnector,
DocumentSource.SHAREPOINT: SharepointConnector,
DocumentSource.TEAMS: TeamsConnector,
DocumentSource.SALESFORCE: SalesforceConnector,
DocumentSource.DISCOURSE: DiscourseConnector,
DocumentSource.AXERO: AxeroConnector,
DocumentSource.CLICKUP: ClickupConnector,
DocumentSource.MEDIAWIKI: MediaWikiConnector,
DocumentSource.WIKIPEDIA: WikipediaConnector,
DocumentSource.ASANA: AsanaConnector,
DocumentSource.S3: BlobStorageConnector,
DocumentSource.R2: BlobStorageConnector,
DocumentSource.GOOGLE_CLOUD_STORAGE: BlobStorageConnector,
DocumentSource.OCI_STORAGE: BlobStorageConnector,
DocumentSource.XENFORO: XenforoConnector,
DocumentSource.DISCORD: DiscordConnector,
DocumentSource.FRESHDESK: FreshdeskConnector,
DocumentSource.FIREFLIES: FirefliesConnector,
DocumentSource.EGNYTE: EgnyteConnector,
DocumentSource.AIRTABLE: AirtableConnector,
}

if any(
[
input_type == InputType.LOAD_STATE
and not issubclass(connector, LoadConnector),
input_type == InputType.POLL and not issubclass(connector, PollConnector),
input_type == InputType.EVENT and not issubclass(connector, EventConnector),
]
):
raise ConnectorMissingException(
f"Connector for source={source} does not accept input_type={input_type}"
)
return connector

class ConnectorCannotHandleInputTypeException(Exception):
pass


_INPUT_TYPE_MAP: dict[
InputType,
Type[
LoadConnector
| PollConnector
| EventConnector
| ConnectorValidator
| SlimConnector
],
] = {
InputType.LOAD_STATE: LoadConnector,
InputType.POLL: PollConnector,
InputType.EVENT: EventConnector,
InputType.VALIDATE_CONFIGURATION: ConnectorValidator,
InputType.SLIM_RETRIEVAL: SlimConnector,
}


def connector_can_handle_type(
connector: Type[BaseConnector], input_type: InputType
) -> bool:
return issubclass(connector, _INPUT_TYPE_MAP[input_type])


def get_connector_class(
source: DocumentSource,
) -> Type[BaseConnector] | None:
return _CONNECTOR_MAP.get(source)


def instantiate_connector(
Expand All @@ -143,7 +147,14 @@ def instantiate_connector(
credential: Credential,
tenant_id: str | None = None,
) -> BaseConnector:
connector_class = identify_connector_class(source, input_type)
connector_class = get_connector_class(source)
if connector_class is None:
raise ConnectorMissingException(f"Connector not found for source={source}")

if not connector_can_handle_type(connector_class, input_type):
raise ConnectorCannotHandleInputTypeException(
f"Connector {connector_class} does not accept input_type={input_type}"
)

if source in DocumentSourceRequiringTenantContext:
connector_specific_config["tenant_id"] = tenant_id
Expand Down
Loading
Loading