Merge pull request #1221 from aodn/oceancurrent-file-api
add ocean current file structure API and test
Showing 12 changed files with 358 additions and 0 deletions.
ARGO/oceancurrent/config.ini
@@ -0,0 +1,7 @@
[fourHourSst]
rootpath = SST_4hr
subproduct = SST_Filled,SST,SST_Age,Wind

[sixDaySst]
rootpath = DR_SST_daily,STATE_daily
subproduct = SST,SST_ANOM,pctiles
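For reference, a minimal sketch (an illustration, not part of the commit) of how this file parses with Python's configparser, which is what load_config() below relies on:

import configparser

config = configparser.ConfigParser()
config.read("config.ini")  # assumes the file above is on disk as config.ini
for section in config.sections():  # ['fourHourSst', 'sixDaySst']
    rootpaths = config.get(section, "rootpath").split(",")
    subproducts = config.get(section, "subproduct").split(",")
    print(section, rootpaths, subproducts)
# fourHourSst ['SST_4hr'] ['SST_Filled', 'SST', 'SST_Age', 'Wind']
# sixDaySst ['DR_SST_daily', 'STATE_daily'] ['SST', 'SST_ANOM', 'pctiles']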
oceancurrent_file_server_api.py
@@ -0,0 +1,267 @@
import configparser
import pandas as pd
import json
import tempfile
import os
from typing import Dict, List, Tuple
from pathlib import Path
import logging

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class Files:
    """
    Files class to store the file information. A file has two attributes, name and path, both strings.
    A Files object can be converted to JSON format through the `to_json` method.
    """
    def __init__(self, name: str, path: str) -> None:
        self.name = name
        self.path = path

    def to_json(self):
        return {
            "name": self.name,
            "path": self.path
        }

class Product:
    """
    A Product class to store the product information. A product has five attributes: product, subProduct, region, path and files.
    Attributes:
        product: string, the product name.
        subProduct: string, the subproduct name.
        region: string, the region name.
        files: List[Files], a list of Files objects.
        path: string, the path of the product on the server.
    A Product object can be converted to JSON format through the `to_json` method.
    Attributes `region`, `files` and `path` can be set through the `set_region`, `set_files` and `set_path` methods.
    """
    def __init__(self, product: str, subProduct: str, region: str) -> None:
        self.product = product
        self.subProduct = subProduct
        self.region = region
        self.path = None
        self.files = []

    def set_region(self, region: str) -> None:
        self.region = region

    def set_files(self, files: List[Files]) -> None:
        self.files = files

    def set_path(self, path: str) -> None:
        self.path = path

    def to_json(self):
        return {
            "path": self.path,
            "product": self.product,
            "subProduct": self.subProduct,
            "region": self.region,
            "files": [f.to_json() for f in self.files]
        }

    def __eq__(self, other):
        if not isinstance(other, Product):
            return NotImplemented

        return self.product == other.product and self.subProduct == other.subProduct and self.region == other.region

class FileStructureAnalyser:
    def __init__(self) -> None:
        self.temp_dir = tempfile.mkdtemp()

    def to_camel_case(self, text):
        """
        Convert a string to camel case format, e.g. "SST_Filled" -> "sstFilled".
        Input:
            text: string, the input text.
        Output:
            string, the camel case format of the input text.
        """
        words = text.replace("_", " ").split()
        return words[0].lower() + ''.join(word.capitalize() for word in words[1:])

    def load_config(self):
        """
        Load the config file `config.ini` and get the watched products and subproducts.
        """
        config = configparser.ConfigParser()
        config_file_path = Path.cwd() / "ARGO" / "oceancurrent" / "config.ini"
        config.read(config_file_path)

        # load watched product root paths
        watchedProduct = []
        watchedSubProduct = []
        productMap = {}
        for section in config.sections():
            subproducts = config.get(section, "subproduct")
            rootpaths = config.get(section, "rootpath")
            if "," in rootpaths:
                rootpath = rootpaths.split(",")
                for r in rootpath:
                    watchedProduct.append(r)
                    productMap[r] = section
            else:
                watchedProduct.append(rootpaths)
                productMap[rootpaths] = section
            if "," in subproducts:
                subproduct = subproducts.split(",")
                for sub in subproduct:
                    watchedSubProduct.append(sub)
            else:
                watchedSubProduct.append(subproducts)
        return watchedProduct, watchedSubProduct, productMap
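    # With the config.ini above, load_config() returns:
    #   watchedProduct    -> ['SST_4hr', 'DR_SST_daily', 'STATE_daily']
    #   watchedSubProduct -> ['SST_Filled', 'SST', 'SST_Age', 'Wind', 'SST', 'SST_ANOM', 'pctiles']
    #     (note: 'SST' appears twice because the lists are not deduplicated)
    #   productMap        -> {'SST_4hr': 'fourHourSst', 'DR_SST_daily': 'sixDaySst', 'STATE_daily': 'sixDaySst'}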

    def data_preprocess(self, file_structure, watchedProduct, watchedSubProduct):
        """
        Preprocess the data from the file structure file. The function reads the file structure data and filters it based on the watched products and subproducts defined in the config file `config.ini`.
        Input:
            file_structure: pd.DataFrame, the file structure data.
            watchedProduct: List[str], the list of watched products.
            watchedSubProduct: List[str], the list of watched subproducts.
        Output:
            pd.DataFrame, the filtered file structure data.
        """
        temp_file_structure = file_structure.copy()
        temp_file_structure["full_path"] = file_structure["default_path"].str[1:]
        temp_file_structure.drop(columns=["default_path"], inplace=True)
        temp_file_structure = temp_file_structure[temp_file_structure['full_path'].str.endswith('.gif')]
        temp_file_structure["paths"] = temp_file_structure["full_path"].str.split("/")
        temp_file_structure.loc[:, "paths"] = temp_file_structure["paths"].apply(lambda x: [item for item in x if item != ''])
        temp_file_structure.loc[:, "product"] = temp_file_structure["paths"].apply(lambda x: x[0])
        temp_file_structure.loc[:, "file_name"] = temp_file_structure["paths"].apply(lambda x: x[-1])
        temp_file_structure = temp_file_structure[temp_file_structure['product'].isin(watchedProduct)]
        temp_file_structure.loc[:, "subProduct"] = temp_file_structure["paths"].apply(lambda x: x[1])
        temp_file_structure = temp_file_structure[temp_file_structure['subProduct'].isin(watchedSubProduct)]
        return temp_file_structure
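    # Worked example for one row (path taken from the test fixtures):
    #   default_path "./DR_SST_daily/SST/AlbEsp/20190801.gif"
    #   -> full_path "/DR_SST_daily/SST/AlbEsp/20190801.gif"
    #   -> paths ['DR_SST_daily', 'SST', 'AlbEsp', '20190801.gif']
    #   -> product "DR_SST_daily", subProduct "SST", file_name "20190801.gif"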

    def group_data_formatter(self, data):
        """
        Group the data by region and format it to the required format.
        Input:
            data: pd.DataFrame, the data to be formatted.
        Output:
            List[dict], the formatted data in the required format: the list of files grouped by region.
        """
        grouped = data.groupby("region")
        grouped_data = []
        for group_name, group_df in grouped:
            product = Product(product=group_df.iloc[0]["product"],
                              subProduct=self.to_camel_case(group_df.iloc[0]["subProduct"]),
                              region=group_name)
            product.set_path(path="/" + group_df.iloc[0]["folder_path"] + "/" + group_df.iloc[0]["region"])
            product_files = []
            for row in group_df.itertuples(index=False):
                file_name = row.file_name
                file_path = row.full_path
                f = Files(name=file_name, path=file_path)
                product_files.append(f)
            product.set_files(product_files)
            grouped_data.append(product.to_json())
        return grouped_data

    def data_formatter(self, ds, productMap):
        """
        Format the data to the required format. The data is grouped by product and subproduct, then saved to a JSON file named after the subproduct under the product folder, e.g. <base_path>/DR_SST_daily/SST.json.
        Input:
            ds: pd.DataFrame, the data to be formatted.
            productMap: dict, a dictionary to map the root path to the product name.
        """
        formatted_data = ds.copy()
        formatted_data["region"] = ds["paths"].apply(lambda x: x[2])
        formatted_data["folder_path"] = ds.apply(lambda row: '/'.join([row['product'], row['subProduct']]), axis=1)

        grouped = formatted_data.groupby("folder_path")
        for group_name, group_df in grouped:
            # copy to avoid mutating a view of the grouped frame
            group_df = group_df.copy()
            group_df["product"] = group_df["product"].apply(lambda x: productMap.get(x))

            grouped_json = self.group_data_formatter(group_df)

            # build the output file path under the base path
            current_folder = self.base_path
            filePath = os.path.join(current_folder, f"{group_name}.json")

            directory = os.path.dirname(filePath)
            if not os.path.exists(directory):
                os.makedirs(directory)

            with open(filePath, 'w') as json_file:
                json.dump(grouped_json, json_file, indent=2)

class FileStructureExplorer(FileStructureAnalyser):
    def __init__(self):
        # extend the FileStructureAnalyser class
        super().__init__()

        # define the watched products, subproducts and product map for the explorer
        # (read the config once instead of three times)
        self.watchedProducts, self.watchedSubProducts, self.productMap = self.get_watched_products()

        self.base_path = None

    def set_base_path(self, base_path: str):
        """
        Set the base directory folder path for exploring the file structure.
        Input:
            base_path: str, the base directory folder path.
        """
        self.base_path = base_path

    def get_watched_products(self) -> Tuple[List[str], List[str], Dict[str, str]]:
        """
        This method reads the configuration file and returns the watched products, the watched subproducts and the product map.
        """
        return self.load_config()

    def list_products(self) -> pd.DataFrame:
        """
        This method goes through the base path and lists the files in the watched products, walking through the subproducts.
        Output:
            pd.DataFrame, the file structure data with the same structure as https://oceancurrent.aodn.org.au/OC_files.txt,
            which has two columns: file_size and default_path.
        """
        products = []
        # find watched products in the current directory
        for product in os.listdir(self.base_path):
            product_path = Path(os.path.join(self.base_path, product))
            if product in self.watchedProducts and product_path.is_dir():
                # list sub products in the watched product folder
                for sub_product in os.listdir(product_path):
                    sub_product_path = Path(os.path.join(product_path, sub_product))
                    if sub_product in self.watchedSubProducts and sub_product_path.is_dir():
                        # list files in the sub product folder
                        for root, _, files in os.walk(sub_product_path):
                            for file in files:
                                # keep only files with '.gif' extension
                                if file.endswith(".gif"):
                                    file_size = os.path.getsize(os.path.join(root, file))
                                    # reformat the file path to be relative to the base path
                                    file_path = os.path.join(root, file).replace(self.base_path, ".")
                                    file_path = file_path.replace("\\", "/")
                                    products.append({"file_size": file_size, "default_path": file_path})
        # convert the list to dataframe
        productDF = pd.DataFrame(products)
        return productDF
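    # Illustrative result (one row per .gif found; sizes vary with the fixtures):
    #   file_size  default_path
    #   1234       ./DR_SST_daily/SST/AlbEsp/20190801.gif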

    def pipeline(self, base_path: str):
        """
        The pipeline to explore the file structure and save the data in the required format.
        Input:
            base_path: str, the base directory folder path.
        """
        self.set_base_path(base_path)
        product_df = self.list_products()
        # analyse file structure to JSON response
        raw_data = self.data_preprocess(product_df, self.watchedProducts, self.watchedSubProducts)
        self.data_formatter(raw_data, self.productMap)


if __name__ == '__main__':
    # configure logging so the completion message is visible
    logging.basicConfig(level=logging.INFO)
    file_structure = FileStructureExplorer()
    file_structure.pipeline(os.path.join(os.path.dirname(__file__), 'tests'))
    logger.info("File structure exploration completed.")
@@ -0,0 +1,84 @@
import os
import json
import tempfile
import shutil
import unittest

from oceancurrent_file_server_api import FileStructureExplorer

class TestFileServerAPI(unittest.TestCase):

    def setUp(self) -> None:
        # Create a temporary directory
        self.test_dir = tempfile.mkdtemp()

        # Path to the existing test files
        self.existing_test_files_path = os.path.join(os.path.dirname(__file__), 'tests')

        # Copy all test files to the temporary directory
        for item in os.listdir(self.existing_test_files_path):
            s = os.path.join(self.existing_test_files_path, item)
            d = os.path.join(self.test_dir, item)
            if os.path.isdir(s):
                shutil.copytree(s, d)
            else:
                shutil.copy2(s, d)

    def tearDown(self) -> None:
        # Remove the temporary directory after each test
        shutil.rmtree(self.test_dir)

    def test_file_structure_explorer(self):
        file_structure = FileStructureExplorer()

        # generate json files through the pipeline method, against the temporary
        # copy so the original fixtures stay untouched
        file_structure.pipeline(self.test_dir)

        # Verify the watched products loaded from the config
        self.assertEqual(file_structure.watchedProducts, ['SST_4hr', 'DR_SST_daily', 'STATE_daily'])

        # Verify the generated json files for a watched product
        self.assertTrue(os.path.exists(os.path.join(self.test_dir, 'SST_4hr', 'SST.json')))

        # Verify the content of a generated json file
        generated_json_path = os.path.join(self.test_dir, "DR_SST_daily", "SST.json")
        with open(generated_json_path, 'r') as f:
            generated_json = json.load(f)
        expected_json = [
            {
                "path": "/DR_SST_daily/SST/AlbEsp",
                "product": "sixDaySst",
                "subProduct": "sst",
                "region": "AlbEsp",
                "files": [
                    {
                        "name": "20190801.gif",
                        "path": "/DR_SST_daily/SST/AlbEsp/20190801.gif"
                    }
                ]
            },
            {
                "path": "/DR_SST_daily/SST/Indo",
                "product": "sixDaySst",
                "subProduct": "sst",
                "region": "Indo",
                "files": [
                    {
                        "name": "20210213.gif",
                        "path": "/DR_SST_daily/SST/Indo/20210213.gif"
                    }
                ]
            },
            {
                "path": "/DR_SST_daily/SST/TimorP",
                "product": "sixDaySst",
                "subProduct": "sst",
                "region": "TimorP",
                "files": [
                    {
                        "name": "20201219.gif",
                        "path": "/DR_SST_daily/SST/TimorP/20201219.gif"
                    }
                ]
            },
        ]

        self.assertEqual(generated_json, expected_json, "The generated SST.json content in DR_SST_daily is incorrect")


if __name__ == '__main__':
    unittest.main()
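# To run the test (a sketch; assumes this file sits next to
# oceancurrent_file_server_api.py with its tests/ fixture directory
# and matches unittest's default test*.py discovery pattern):
#   python -m unittest discover -v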
(The remaining nine changed files in this commit cannot be displayed.)