Commit
Merge pull request #1221 from aodn/oceancurrent-file-api
add ocean current file structure API and test
weited authored Jan 14, 2025
2 parents 2c42648 + 268ca5b commit cbc51c9
Showing 12 changed files with 358 additions and 0 deletions.
7 changes: 7 additions & 0 deletions ARGO/oceancurrent/config.ini
@@ -0,0 +1,7 @@
[fourHourSst]
rootpath = SST_4hr
subproduct = SST_Filled,SST,SST_Age,Wind

[sixDaySst]
rootpath = DR_SST_daily,STATE_daily
subproduct = SST,SST_ANOM,pctiles
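
For reference, a minimal sketch (assuming `config.ini` is read from the working directory) of what Python's configparser yields for this file:

import configparser

config = configparser.ConfigParser()
config.read("config.ini")
# config.sections() -> ['fourHourSst', 'sixDaySst']
# config.get('sixDaySst', 'rootpath') -> 'DR_SST_daily,STATE_daily'
# config.get('sixDaySst', 'subproduct') -> 'SST,SST_ANOM,pctiles'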
267 changes: 267 additions & 0 deletions ARGO/oceancurrent/oceancurrent_file_server_api.py
@@ -0,0 +1,267 @@
import configparser
import pandas as pd
import json
import tempfile
import os
from typing import List
from pathlib import Path
import logging

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class Files:
"""
Files class to store the file information. A file has two attributes: name and path. Both in string format.
A file object can be converted to json format through the `to_json` method.
"""
def __init__(self, name: str, path:str) -> None:
self.name = name
self.path = path

def to_json(self):
return {
"name": self.name,
"path": self.path
}
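
# Illustrative example (values taken from the test fixtures below):
# Files("20190801.gif", "/DR_SST_daily/SST/AlbEsp/20190801.gif").to_json()
# -> {"name": "20190801.gif", "path": "/DR_SST_daily/SST/AlbEsp/20190801.gif"}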

class Product:
"""
A Product class to store the product information. A product has four attributes: product, subProduct, region, path and files.
Attributes:
product: string, the product name.
subProduct: string, the subproduct name.
region: string, the region name.
files: List[Files], a list of Files objects.
path: string, the path of the product in the server.
A product object can be converted to json format through the `to_json` method.
Attributes `region`, `files` and `path` can be set through the `set_region`, `set_files` and `set_path` methods.
"""
def __init__(self, product: str, subProduct: str, region: str) -> None:
self.product = product
self.subProduct = subProduct
self.region = region
self.path = None
self.files = []

def set_region(self, region: str) -> None:
self.region = region

def set_files(self, files: List[Files]) -> None:
self.files = files

def set_path(self, path: str) -> None:
self.path = path

def to_json(self):
return {
"path": self.path,
"product": self.product,
"subProduct": self.subProduct,
"region": self.region,
"files": [f.to_json() for f in self.files]
}

def __eq__(self, other):
if not isinstance(other, Product):
return NotImplemented

return self.product == other.product and self.subProduct == other.subProduct and self.region == other.region
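
# Illustrative example of a serialised Product (shape mirrors the expected JSON in the test below):
# p = Product(product="sixDaySst", subProduct="sst", region="AlbEsp")
# p.set_path("/DR_SST_daily/SST/AlbEsp")
# p.set_files([Files("20190801.gif", "/DR_SST_daily/SST/AlbEsp/20190801.gif")])
# p.to_json() -> {"path": "/DR_SST_daily/SST/AlbEsp", "product": "sixDaySst",
#                 "subProduct": "sst", "region": "AlbEsp", "files": [...]}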

class FileStructureAnalyser:
def __init__(self) -> None:
self.temp_dir = tempfile.mkdtemp()

def to_camel_case(self, text):
"""
Convert a string to camel case format.
Input:
text: string, the input text.
Output:
string, the camel case format of the input text.
"""
words = text.replace("_", " ").split()
return words[0].lower() + ''.join(word.capitalize() for word in words[1:])
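    # e.g. to_camel_case("SST_Filled") -> "sstFilled"; to_camel_case("SST_ANOM") -> "sstAnom"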

    def load_config(self):
        """
        Load the config file `config.ini` and collect the watched products and subproducts.
        """
        config = configparser.ConfigParser()
        config_file_path = Path.cwd() / "ARGO" / "oceancurrent" / "config.ini"
        config.read(config_file_path)

        # load watched product root paths and subproducts; a comma-separated value
        # lists several entries, and str.split(",") also covers the single-entry case
        watchedProduct = []
        watchedSubProduct = []
        productMap = {}
        for section in config.sections():
            for rootpath in config.get(section, "rootpath").split(","):
                watchedProduct.append(rootpath)
                productMap[rootpath] = section
            for subproduct in config.get(section, "subproduct").split(","):
                watchedSubProduct.append(subproduct)
        return watchedProduct, watchedSubProduct, productMap
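
    # With the config.ini above, load_config() returns:
    #   watchedProduct    -> ['SST_4hr', 'DR_SST_daily', 'STATE_daily']
    #   watchedSubProduct -> ['SST_Filled', 'SST', 'SST_Age', 'Wind', 'SST', 'SST_ANOM', 'pctiles']
    #   productMap        -> {'SST_4hr': 'fourHourSst', 'DR_SST_daily': 'sixDaySst', 'STATE_daily': 'sixDaySst'}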

def data_preprocess(self, file_structure, watchedProduct, watchedSubProduct):
"""
Preprocess the data from the file structure file. The function will read the file structure file and filter the data based on the watched products and subproducts which is defined in the config file `config.ini`.
Input:
file_structure: pd.DataFrame, the file structure data.
watchedProduct: List[str], the list of watched products.
watchedSubProduct: List[str], the list of watched subproducts.
Output:
file_structure: pd.DataFrame, the filtered file structure data.
productMap: dict, a dictionary to map the root path to the product name.
"""
temp_file_structure = file_structure.copy()
temp_file_structure["full_path"] = file_structure["default_path"].str[1:]
temp_file_structure.drop(columns=["default_path"], inplace=True)
temp_file_structure = temp_file_structure[temp_file_structure['full_path'].str.endswith('.gif')]
temp_file_structure["paths"] = temp_file_structure["full_path"].str.split("/")
temp_file_structure.loc[:, "paths"] = temp_file_structure["paths"].apply(lambda x: [item for item in x if item != ''])
temp_file_structure.loc[:, "product"] = temp_file_structure["paths"].apply(lambda x: x[0])
temp_file_structure.loc[:, "file_name"] = temp_file_structure["paths"].apply(lambda x: x[-1])
temp_file_structure = temp_file_structure[temp_file_structure['product'].isin(watchedProduct)]
temp_file_structure.loc[:, "subProduct"] = temp_file_structure["paths"].apply(lambda x: x[1])
temp_file_structure = temp_file_structure[temp_file_structure['subProduct'].isin(watchedSubProduct)]
return temp_file_structure
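
    # Illustrative trace for one fixture row: default_path "./DR_SST_daily/SST/AlbEsp/20190801.gif"
    # becomes full_path "/DR_SST_daily/SST/AlbEsp/20190801.gif" with
    # paths ['DR_SST_daily', 'SST', 'AlbEsp', '20190801.gif'],
    # product "DR_SST_daily", subProduct "SST" and file_name "20190801.gif"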

def group_data_formatter(self, data):
"""
Group the data by the region and format the data to the required format.
Input:
data: pd.DataFrame, the data to be formatted.
Output:
List[dict], the formatted data in the required, which is the list of files grouped by the region.
"""
grouped = data.groupby("region")
grouped_data = []
for group_name, group_df in grouped:
product = Product(product=group_df.iloc[0]["product"],
subProduct=self.to_camel_case(group_df.iloc[0]["subProduct"]),
region=group_name)
product.set_path(path="/" + group_df.iloc[0]["folder_path"] + "/" + group_df.iloc[0]["region"])
product_files = []
for row in group_df.itertuples(index=False):
file_name = row.file_name
file_path = row.full_path
f = Files(name=file_name, path=file_path)
product_files.append(f)
product.set_files(product_files)
grouped_data.append(product.to_json())
return grouped_data
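
    # Each returned dict has the shape
    #   {"path": ..., "product": ..., "subProduct": ..., "region": ..., "files": [{"name": ..., "path": ...}]}
    # (see the expected JSON in the test file for concrete values)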

    def data_formatter(self, ds, productMap):
        """
        Format the data into the required format. The data is grouped by product and subproduct,
        and each group is saved to a JSON file named after the subproduct under the product folder.
        Input:
            ds: pd.DataFrame, the data to be formatted.
            productMap: dict, a dictionary to map the root path to the product name.
        """
        formatted_data = ds.copy()
        formatted_data["region"] = ds["paths"].apply(lambda x: x[2])
        formatted_data["folder_path"] = ds.apply(lambda row: '/'.join([row['product'], row['subProduct']]), axis=1)

        grouped = formatted_data.groupby("folder_path")
        for group_name, group_df in grouped:
            # copy the group slice before mutating it, to avoid pandas' SettingWithCopyWarning
            group_df = group_df.copy()
            # map the root path (e.g. "DR_SST_daily") to the product name (e.g. "sixDaySst")
            group_df["product"] = group_df["product"].apply(lambda x: productMap.get(x))

            grouped_records = self.group_data_formatter(group_df)

            # base_path is set by the explorer subclass (see FileStructureExplorer.set_base_path)
            filePath = os.path.join(self.base_path, f"{group_name}.json")
            os.makedirs(os.path.dirname(filePath), exist_ok=True)

            with open(filePath, 'w') as json_file:
                json.dump(grouped_records, json_file, indent=2)
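
# With the test fixtures, data_formatter writes one JSON file per product/subproduct group,
# e.g. "<base_path>/DR_SST_daily/SST.json" and "<base_path>/SST_4hr/SST.json"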

class FileStructureExplorer(FileStructureAnalyser):
    def __init__(self):
        # extend the FileStructureAnalyser class
        super().__init__()

        # read the config once and unpack the watched products (list of strings),
        # the watched subproducts and the root-path-to-product map
        self.watchedProducts, self.watchedSubProducts, self.productMap = self.get_watched_products()

        self.base_path = None

    def set_base_path(self, base_path: str):
        """
        Set the base directory folder path for exploring the file structure.
        Input:
            base_path: str, the base directory folder path.
        """
        self.base_path = base_path

    def get_watched_products(self):
        """
        Read the configuration file and return the watched products, the watched subproducts
        and the product map.
        """
        return self.load_config()


def list_products(self) -> pd.DataFrame:
"""
This method go through the base path and list the files in the products, with a walk through the subproducts.
Output:
pd.DataFrame, the file structure data which has the same structure from https://oceancurrent.aodn.org.au/OC_files.txt,
which has two columns: file_size and default_path.
"""
products = []
# find watched products in current directory
for product in os.listdir(self.base_path):
product_path = Path(os.path.join(self.base_path, product))
if product in self.watchedProducts and product_path.is_dir():
# list sub products in the watched product folder
for sub_product in os.listdir(product_path):
sub_product_path = Path(os.path.join(product_path, sub_product))
if sub_product in self.watchedSubProducts and sub_product_path.is_dir():
# list files in the sub product folder
for root, _, files in os.walk(sub_product_path):
for file in files:
# keep only files with '.gif' extension
if file.endswith(".gif"):
file_size = os.path.getsize(os.path.join(root, file))
# reformat the file path to be relative to the base path
file_path = os.path.join(root, file).replace(self.base_path, ".")
file_path = file_path.replace("\\", "/")
products.append({"file_size": file_size, "default_path": file_path})
# convert the list to dataframe
productDF = pd.DataFrame(products)
return productDF
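
    # Each collected row looks like (file_size is illustrative):
    #   {"file_size": 34567, "default_path": "./DR_SST_daily/SST/AlbEsp/20190801.gif"}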

def pipeline(self, base_path: str):
"""
The pipeline to explore the file structure and save the data to the required format.
Input:
base_path: str, the base directory folder path.
"""
self.set_base_path(base_path)
list_products = self.list_products()
# analyse file structure to JSON response
raw_data = self.data_preprocess(list_products, self.watchedProducts, self.watchedSubProducts)
self.data_formatter(raw_data, self.productMap)

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    file_structure = FileStructureExplorer()
    file_structure.pipeline(os.path.join(os.path.dirname(__file__), 'tests'))
    logger.info("File structure exploration completed.")
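
A minimal usage sketch (the directory path here is hypothetical; any folder laid out like the watched products works):

from oceancurrent_file_server_api import FileStructureExplorer

explorer = FileStructureExplorer()
# walks the directory, keeps .gif files under watched products/subproducts,
# and writes one <product>/<subproduct>.json per group under the same directory
explorer.pipeline("/data/oceancurrent")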
84 changes: 84 additions & 0 deletions ARGO/oceancurrent/test_oceancurrent_file_server_api.py
@@ -0,0 +1,84 @@
import os
import json
import tempfile
import shutil
import unittest

from oceancurrent_file_server_api import FileStructureExplorer

class TestFileServerAPI(unittest.TestCase):

    def setUp(self) -> None:
        # Create a temporary directory
        self.test_dir = tempfile.mkdtemp()

        # Path to the existing test files
        self.existing_test_files_path = os.path.join(os.path.dirname(__file__), 'tests')

        # Copy all test files to the temporary directory, so the pipeline
        # writes its JSON output there rather than into the source tree
        for item in os.listdir(self.existing_test_files_path):
            s = os.path.join(self.existing_test_files_path, item)
            d = os.path.join(self.test_dir, item)
            if os.path.isdir(s):
                shutil.copytree(s, d)
            else:
                shutil.copy2(s, d)

    def tearDown(self) -> None:
        # Remove the temporary directory and its contents
        shutil.rmtree(self.test_dir)

    def test_file_structure_explorer(self):
        file_structure = FileStructureExplorer()

        # generate json files through the pipeline method, against the temporary copy
        file_structure.pipeline(self.test_dir)

        # Verify the watched products loaded from the config
        self.assertEqual(file_structure.watchedProducts, ['SST_4hr', 'DR_SST_daily', 'STATE_daily'])

        # Verify the generated json files for a watched product
        self.assertTrue(os.path.exists(os.path.join(self.test_dir, 'SST_4hr', 'SST.json')))

        # Verify the content of a generated json file
        generated_json_path = os.path.join(self.test_dir, "DR_SST_daily", "SST.json")
with open(generated_json_path, 'r') as f:
generated_json = json.load(f)
expected_json = [
{
"path": "/DR_SST_daily/SST/AlbEsp",
"product": "sixDaySst",
"subProduct": "sst",
"region": "AlbEsp",
"files": [
{
"name": "20190801.gif",
"path": "/DR_SST_daily/SST/AlbEsp/20190801.gif"
}
]},
{
"path": "/DR_SST_daily/SST/Indo",
"product": "sixDaySst",
"subProduct": "sst",
"region": "Indo",
"files": [
{
"name": "20210213.gif",
"path": "/DR_SST_daily/SST/Indo/20210213.gif"
}
]},
{
"path": "/DR_SST_daily/SST/TimorP",
"product": "sixDaySst",
"subProduct": "sst",
"region": "TimorP",
"files": [
{
"name": "20201219.gif",
"path": "/DR_SST_daily/SST/TimorP/20201219.gif"
}
]},
]

        self.assertEqual(generated_json, expected_json, "The generated SST.json content in DR_SST_daily is incorrect")

if __name__ == '__main__':
unittest.main()
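
The test can be run directly with `python -m unittest test_oceancurrent_file_server_api.py`, assuming the module and its `tests` fixture folder are in the current directory.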


(The remaining nine changed files are binary .gif images, presumably the test fixtures under the `tests` folder, which cannot be displayed in the diff.)