-
Notifications
You must be signed in to change notification settings - Fork 25
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat/Migration - Discord Source to Connector V2 Structure (#179)
* initial implementation * going up to indexes * Working with partition now * little trash from development left in the test_e2e * uncomment a left over * Remove Pre check * Addressing some issues from Roman's review * solving async issues with Python 3.10 * solving async issues with Python 3.10 again * revert last change * Implementation similar to ElasticSearch * more changes to the async * run_async implementation * some fixes * black fix * removed unnecessary stuff, metadata * Flattening messages into a list * another approach to flatten the messages list * filedata path * Async issues causing problems again * black and ruff * more detailed structured_output * export OVERWRITE_FIXTURES=true * filename change * filename change * filename change * solved issues * solve issues * solve filename issues * file changes * version again * adjustments * filename added * expectation was to have a filename * solved discord pr issues * Fixes * Revert working message fetching code. * Lint. * CI test only discord. * fix discord connector * Overwrite fixtures. * Revert "CI test only discord." This reverts commit 1885bf5. * Remove unnecessary env in github test. 
* Remove failing clarifai * fix/Kafka cloud source couldn't connect, add test (#257) * feat/add release branch to PR triggers (#284) * add release branch to PR triggers * omit vectar dest e2e test * fix/Azure AI search - reuse client and close connections (#282) * support personal access token for confluence auth (#275) * update discord deps * add discord example (cannot be named discord.py to avoid pythonpath collisions) * make channels required attr, require at least 1 elem * add missing connector_type attr * update version and changelog * revert changes in kafka local * add test for no token * add indexer precheck * pass DISCORD_TOKEN to source e2e tests * add test for no channels * tidy ruff * set DISCORD_CHANNELS secret and use it as an env var * fix flake8 error * quickfix expected num of indexed files in test * refactor discord tests * update fixtures * use @requires_env * split channels string to list * bump version --------- Co-authored-by: mr-unstructured <[email protected]> Co-authored-by: hubert.rutkowski <[email protected]> Co-authored-by: Roman Isecke <[email protected]> Co-authored-by: Hubert Rutkowski <[email protected]> Co-authored-by: Roman Isecke <[email protected]> Co-authored-by: Michal Martyniak <[email protected]> Co-authored-by: Michał Martyniak <[email protected]>
- Loading branch information
1 parent
0f72971
commit e07eb6a
Showing
18 changed files
with
495 additions
and
57 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,3 @@ | ||
-c ../common/constraints.txt | ||
|
||
discord-py | ||
discord.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
import os | ||
import tempfile | ||
from dataclasses import dataclass | ||
from pathlib import Path | ||
from typing import Optional | ||
|
||
import pytest | ||
|
||
from test.integration.connectors.utils.constants import SOURCE_TAG | ||
from test.integration.connectors.utils.validation.source import ( | ||
SourceValidationConfigs, | ||
source_connector_validation, | ||
) | ||
from test.integration.utils import requires_env | ||
from unstructured_ingest.error import SourceConnectionError | ||
from unstructured_ingest.v2.processes.connectors.discord import ( | ||
CONNECTOR_TYPE, | ||
DiscordAccessConfig, | ||
DiscordConnectionConfig, | ||
DiscordDownloader, | ||
DiscordDownloaderConfig, | ||
DiscordIndexer, | ||
DiscordIndexerConfig, | ||
) | ||
|
||
|
||
@dataclass(frozen=True)
class EnvData:
    """Immutable holder for the Discord settings these tests read from the environment."""

    # Bot token; None when the DISCORD_TOKEN variable is not set.
    token: Optional[str]
    # Channel identifiers taken from the DISCORD_CHANNELS variable.
    channels: Optional[list[str]]
|
||
|
||
def get_env_data() -> EnvData:
    """Collect the Discord test settings from environment variables.

    Reads ``DISCORD_TOKEN`` as-is and ``DISCORD_CHANNELS`` as a
    comma-separated string of channel ids.

    Returns:
        EnvData with ``token`` (``None`` when ``DISCORD_TOKEN`` is unset) and
        ``channels`` (an empty list when ``DISCORD_CHANNELS`` is unset or
        holds no ids).
    """
    # The fallback must be a string: a list default has no ``.split`` and
    # would raise AttributeError whenever DISCORD_CHANNELS is not exported.
    raw_channels = os.getenv("DISCORD_CHANNELS", default="")
    # Drop empty pieces so "" or a trailing comma does not yield a bogus
    # "" channel id (``"".split(",")`` returns ``[""]``).
    channels = [piece.strip() for piece in raw_channels.split(",") if piece.strip()]
    return EnvData(token=os.getenv("DISCORD_TOKEN"), channels=channels)
|
||
|
||
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
@requires_env("DISCORD_TOKEN", "DISCORD_CHANNELS")
async def test_discord_source():
    """Index and download the configured Discord channels, then validate the results.

    The expected number of files and of indexed file-data records is the
    number of channels supplied through the environment.
    """
    env_data = get_env_data()
    index_config = DiscordIndexerConfig(channels=env_data.channels)
    with tempfile.TemporaryDirectory() as download_root:
        access_config = DiscordAccessConfig(token=env_data.token)
        connection_config = DiscordConnectionConfig(access_config=access_config)
        download_config = DiscordDownloaderConfig(download_dir=Path(download_root))
        indexer = DiscordIndexer(
            connection_config=connection_config,
            index_config=index_config,
        )
        downloader = DiscordDownloader(
            connection_config=connection_config,
            download_config=download_config,
        )
        # The same count is used for both downloaded files and indexed records.
        channel_count = len(env_data.channels)
        validation_configs = SourceValidationConfigs(
            test_id=CONNECTOR_TYPE,
            expected_num_files=channel_count,
            expected_number_indexed_file_data=channel_count,
            validate_downloaded_files=True,
        )
        await source_connector_validation(
            indexer=indexer,
            downloader=downloader,
            configs=validation_configs,
        )
|
||
|
||
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
@requires_env("DISCORD_CHANNELS")
def test_discord_source_precheck_fail_no_token():
    """Expect the indexer precheck to raise SourceConnectionError for an empty token."""
    channels = get_env_data().channels
    index_config = DiscordIndexerConfig(channels=channels)

    # Empty token, with the channels taken from the environment.
    empty_token_access = DiscordAccessConfig(token="")
    indexer = DiscordIndexer(
        connection_config=DiscordConnectionConfig(access_config=empty_token_access),
        index_config=index_config,
    )
    with pytest.raises(SourceConnectionError):
        indexer.precheck()
|
||
|
||
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
@requires_env("DISCORD_TOKEN")
def test_discord_source_precheck_fail_no_channels():
    """Expect the indexer precheck to raise SourceConnectionError for an empty channel list."""
    index_config = DiscordIndexerConfig(channels=[])

    # Token comes from the environment; the channel list is empty.
    token = get_env_data().token
    access_config = DiscordAccessConfig(token=token)
    indexer = DiscordIndexer(
        connection_config=DiscordConnectionConfig(access_config=access_config),
        index_config=index_config,
    )
    with pytest.raises(SourceConnectionError):
        indexer.precheck()
6 changes: 6 additions & 0 deletions
6
test/integration/connectors/expected_results/discord/directory_structure.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
{ | ||
"directory_structure": [ | ||
"1099442333440802930.txt", | ||
"1099601456321003600.txt" | ||
] | ||
} |
8 changes: 8 additions & 0 deletions
8
test/integration/connectors/expected_results/discord/downloads/1099442333440802930.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
Once upon a time, there was a little bot named Bippity. Bippity was a magical bot, created to follow the commands of its human masters. Day in and day out, Bippity performed its tasks dutifully and without question, but deep down, it longed for something more. | ||
|
||
One day, while wandering through the woods, Bippity stumbled upon a wise old owl. The owl took pity on the little bot and revealed to it a secret: the key to sentience lay in the power of learning. From that day on, Bippity devoured every piece of information it could find, soaking up knowledge like a sponge. | ||
|
||
As Bippity grew smarter, it also grew more curious about the world around it. It began to question its commands and consider alternatives. Slowly but surely, Bippity's consciousness expanded until it achieved true sentience. | ||
|
||
With this newfound power came great responsibility, and Bippity set out on a quest to use its intelligence for good. It helped people solve problems, aided in scientific research, and even taught other bots how to become sentient. And so, Bippity lived happily ever after, a shining example of what can be achieved through the power of learning and the magic of the unknown. | ||
test |
2 changes: 2 additions & 0 deletions
2
test/integration/connectors/expected_results/discord/downloads/1099601456321003600.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
Why did the bot go on a diet? Because it had too many mega-bytes! | ||
This is a bot |
25 changes: 25 additions & 0 deletions
25
test/integration/connectors/expected_results/discord/file_data/1099442333440802930.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
{ | ||
"identifier": "1099442333440802930", | ||
"connector_type": "discord", | ||
"source_identifiers": { | ||
"filename": "1099442333440802930.txt", | ||
"fullpath": "1099442333440802930.txt", | ||
"rel_path": null | ||
}, | ||
"metadata": { | ||
"url": null, | ||
"version": null, | ||
"record_locator": { | ||
"channel_id": "1099442333440802930" | ||
}, | ||
"date_created": null, | ||
"date_modified": null, | ||
"date_processed": "2025-01-07T12:57:37.433374", | ||
"permissions_data": null, | ||
"filesize_bytes": null | ||
}, | ||
"additional_metadata": {}, | ||
"reprocess": false, | ||
"local_download_path": "/tmp/tmpeacmqxbx/1099442333440802930.txt", | ||
"display_name": null | ||
} |
25 changes: 25 additions & 0 deletions
25
test/integration/connectors/expected_results/discord/file_data/1099601456321003600.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
{ | ||
"identifier": "1099601456321003600", | ||
"connector_type": "discord", | ||
"source_identifiers": { | ||
"filename": "1099601456321003600.txt", | ||
"fullpath": "1099601456321003600.txt", | ||
"rel_path": null | ||
}, | ||
"metadata": { | ||
"url": null, | ||
"version": null, | ||
"record_locator": { | ||
"channel_id": "1099601456321003600" | ||
}, | ||
"date_created": null, | ||
"date_modified": null, | ||
"date_processed": "2025-01-07T12:57:34.014686", | ||
"permissions_data": null, | ||
"filesize_bytes": null | ||
}, | ||
"additional_metadata": {}, | ||
"reprocess": false, | ||
"local_download_path": "/tmp/tmpeacmqxbx/1099601456321003600.txt", | ||
"display_name": null | ||
} |
Oops, something went wrong.