Skip to content

Commit

Permalink
fix fsspec metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
rbiseck3 committed Oct 2, 2024
1 parent 1475a40 commit 6cc4765
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 6 deletions.
3 changes: 3 additions & 0 deletions unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ class DropboxIndexer(FsspecIndexer):
index_config: DropboxIndexerConfig
connector_type: str = CONNECTOR_TYPE

# Return the path for one listed file entry: the entry's "name" value with
# every leading "/" character stripped.
def get_path(self, file_data: dict) -> str:
return file_data["name"].lstrip("/")

def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
path = file_data["name"].lstrip("/")
date_created = None
Expand Down
14 changes: 8 additions & 6 deletions unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,19 +133,21 @@ def get_file_data(self) -> list[dict[str, Any]]:
# Build the FileDataSourceMetadata for one listed file entry.
# This version always raises NotImplementedError; the Dropbox and S3 indexers
# in this commit define their own get_metadata with the same signature.
def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
raise NotImplementedError()

def sterilize_info(self, path) -> dict:
info = self.fs.info(path=path)
return sterilize_dict(data=info)
# Return the path for one listed file entry by reading its "name" key.
# Raises KeyError if the entry has no "name" key.
# NOTE(review): presumably file_data is one of the dicts returned by
# get_file_data() -- confirm the key is present for every backend.
def get_path(self, file_data: dict) -> str:
return file_data["name"]

# Pass the already-fetched file entry dict through sterilize_dict and return
# the result; no filesystem lookup is made here.
# NOTE(review): sterilize_dict is defined outside this view -- presumably it
# converts values into a serializable form; confirm against its definition.
def sterilize_info(self, file_data: dict) -> dict:
return sterilize_dict(data=file_data)

def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
files = self.get_file_data()
for file_data in files:
file_path = file_data["key"]
file_path = self.get_path(file_data=file_data)
# Note: we remove any remaining leading slashes (Box introduces these)
# to get a valid relative path
rel_path = file_path.replace(self.index_config.path_without_protocol, "").lstrip("/")

additional_metadata = self.sterilize_info(path=file_path)
additional_metadata = self.sterilize_info(file_data=file_data)
additional_metadata["original_file_path"] = file_path
yield FileData(
identifier=str(uuid5(NAMESPACE_DNS, file_path)),
Expand All @@ -155,7 +157,7 @@ def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
rel_path=rel_path or None,
fullpath=file_path,
),
metadata=self.get_metadata(path=file_path),
metadata=self.get_metadata(file_data=file_data),
additional_metadata=additional_metadata,
)

Expand Down
3 changes: 3 additions & 0 deletions unstructured_ingest/v2/processes/connectors/fsspec/s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@ class S3Indexer(FsspecIndexer):
index_config: S3IndexerConfig
connector_type: str = CONNECTOR_TYPE

# Return the path for one listed file entry by reading its "Key" entry
# (capital "K"). Raises KeyError if the entry has no "Key".
def get_path(self, file_data: dict) -> str:
return file_data["Key"]

def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
path = file_data["Key"]
date_created = None
Expand Down

0 comments on commit 6cc4765

Please sign in to comment.