From 33eef29a808f4c93e834c19a33ffb34bebc8384a Mon Sep 17 00:00:00 2001
From: GCR-1178 <jiaweichen.china@gmail.com>
Date: Wed, 3 Jul 2024 09:35:05 +0000
Subject: [PATCH 1/2] add cycler [ARBIN, NEWARE] and dump single file

---
 README.md                                 |  13 ++
 batteryml/preprocess/__init__.py          |  12 +-
 batteryml/preprocess/base.py              |  42 +++-
 batteryml/preprocess/preprocess_CALCE.py  |  23 +-
 batteryml/preprocess/preprocess_HNEI.py   |  21 +-
 batteryml/preprocess/preprocess_HUST.py   |  28 ++-
 batteryml/preprocess/preprocess_MATR.py   |  39 ++--
 batteryml/preprocess/preprocess_OX.py     |  22 +-
 batteryml/preprocess/preprocess_RWTH.py   |  28 ++-
 batteryml/preprocess/preprocess_SNL.py    |  24 +-
 batteryml/preprocess/preprocess_UL_PUR.py |  24 +-
 batteryml/preprocess/preprocess_arbin.py  | 226 +++++++++++++++++++
 batteryml/preprocess/preprocess_neware.py | 262 ++++++++++++++++++++++
 batteryml/utils/config.py                 |  16 +-
 bin/batteryml.py                          |  19 +-
 configs/cyclers/arbin.yaml                |  37 +++
 configs/cyclers/arbin_metadata.yaml       |  42 ++++
 configs/cyclers/neware.yaml               |  42 ++++
 18 files changed, 848 insertions(+), 72 deletions(-)
 create mode 100644 batteryml/preprocess/preprocess_arbin.py
 create mode 100644 batteryml/preprocess/preprocess_neware.py
 create mode 100644 configs/cyclers/arbin.yaml
 create mode 100644 configs/cyclers/arbin_metadata.yaml
 create mode 100644 configs/cyclers/neware.yaml

diff --git a/README.md b/README.md
index 8d4c796..8687a75 100644
--- a/README.md
+++ b/README.md
@@ -96,6 +96,19 @@ batteryml download MATR /path/to/save/raw/data
 batteryml preprocess MATR /path/to/save/raw/data /path/to/save/processed/data
 ```
 
+### Run Cycler Preprocessing Scripts to process your data
+If your data was recorded by a battery cycler such as ARBIN or NEWARE, you can use the following command to convert it into BatteryML's `BatteryData` format.
+
+```bash
+batteryml preprocess ARBIN /path/to/save/raw/data /path/to/save/processed/data --config /path/to/config/yaml/file
+```
+
+Because software versions and export settings vary, the data format and field names exported by the same cycler may differ. We therefore provide default processing configurations in the `configs/cyclers` directory that map raw data columns to the target data fields. You can edit these default configurations as needed.
+
+We currently support the `ARBIN` and `NEWARE` data formats; support for the `Biologic`, `LANDT`, and `Indigo` formats is being integrated. If you encounter any problems when processing your cycler data, please open an issue and attach a sample data file so that we can quickly add compatibility with your data format.
+
+
+
 ### Run training and/or inference tasks using config files
 
 BatteryML supports using a simple config file to specify the training and inference process. We provided several examples in `configs`. For example, to reproduce the "variance" model for battery life prediction, run
diff --git a/batteryml/preprocess/__init__.py b/batteryml/preprocess/__init__.py
index 1447f86..2b78577 100644
--- a/batteryml/preprocess/__init__.py
+++ b/batteryml/preprocess/__init__.py
@@ -1,3 +1,4 @@
+import logging
 from .download import DOWNLOAD_LINKS, download_file
 from .preprocess_CALCE import CALCEPreprocessor
 from .preprocess_HNEI import HNEIPreprocessor
@@ -7,8 +8,13 @@
 from .preprocess_RWTH import RWTHPreprocessor
 from .preprocess_SNL import SNLPreprocessor
 from .preprocess_UL_PUR import UL_PURPreprocessor
+from .preprocess_arbin import ARBINPreprocessor
+from .preprocess_neware import NEWAREPreprocessor
 
+logging.basicConfig(level=logging.INFO,
+                    format='%(asctime)s - %(levelname)s - %(message)s')
 
-SUPPORTED_SOURCES = [
-    'CALCE', 'HNEI', 'HUST', 'MATR', 'OX', 'RWTH', 'SNL', 'UL_PUR'
-]
\ No newline at end of file
+SUPPORTED_SOURCES = {
+    'DATASETS': ['CALCE', 'HNEI', 'HUST', 'MATR', 'OX', 'RWTH', 'SNL', 'UL_PUR'],
+    'CYCLERS': ['ARBIN', 'BATTERYARCHIVE', "BIOLOGIC",  'INDIGO',  "LANDT", "MACCOR", 'NEWARE', 'NOVONIX']
+}
diff --git a/batteryml/preprocess/base.py b/batteryml/preprocess/base.py
index f6956cc..3f52c41 100644
--- a/batteryml/preprocess/base.py
+++ b/batteryml/preprocess/base.py
@@ -1,6 +1,8 @@
 # Licensed under the MIT License.
 # Copyright (c) Microsoft Corporation.
 
+import os
+import logging
 from tqdm import tqdm
 from typing import List
 from pathlib import Path
@@ -14,22 +16,38 @@ def __init__(self,
         self.silent = silent
         self.output_dir = Path(output_dir)
 
-    def process(self, parentdir: str) -> List[BatteryData]:
+    def process(self, *args, **kwargs) -> List[BatteryData]:
         """Main logic for preprocessing data."""
 
-    def __call__(self, parentdir: str):
-        batteries = self.process(parentdir)
-        self.dump(batteries)
+    def __call__(self, *args, **kwargs):
+        process_batteries_num, skip_batteries_num = self.process(
+            *args, **kwargs)
         if not self.silent:
-            self.summary(batteries)
+            print(f'Successfully processed {process_batteries_num} batteries.')
+            print(f'Skip processing {skip_batteries_num} batteries.')
 
-    def dump(self, batteries: List[BatteryData]):
-        if not self.silent:
-            batteries = tqdm(
-                batteries,
-                desc=f'Dump batteries to {str(self.output_dir)}')
-        for battery in batteries:
-            battery.dump(self.output_dir / f'{battery.cell_id}.pkl')
+    def check_processed_file(self, processed_file: str):  # Return True when "<processed_file>.pkl" already exists in output_dir and is non-empty.
+        expected_pkl_path = os.path.join(
+            self.output_dir, (f"{processed_file}.pkl"))
+        if os.path.exists(expected_pkl_path) and os.path.getsize(expected_pkl_path) > 0:
+            logging.info(
+                f'Skip processing {processed_file}, pkl file already exists and is not empty.')
+            return True
+        elif os.path.exists(expected_pkl_path) and os.path.getsize(expected_pkl_path) == 0:  # Empty file: only log it, then fall through to False.
+            logging.info(
+                f'Found empty pkl file for {processed_file}.')
+        return False  # Missing or empty pkl file.
+
+    # def dump(self, batteries: List[BatteryData]):
+    #     if not self.silent:
+    #         batteries = tqdm(
+    #             batteries,
+    #             desc=f'Dump batteries to {str(self.output_dir)}')
+    #     for battery in batteries:
+    #         battery.dump(self.output_dir / f'{battery.cell_id}.pkl')
+
+    def dump_single_file(self, battery: BatteryData):  # Dump one battery to "<output_dir>/<cell_id>.pkl" via battery.dump.
+        battery.dump(self.output_dir / f'{battery.cell_id}.pkl')
 
     def summary(self, batteries: List[BatteryData]):
         print(f'Successfully processed {len(batteries)} batteries.')
diff --git a/batteryml/preprocess/preprocess_CALCE.py b/batteryml/preprocess/preprocess_CALCE.py
index 6ada30d..70e70e6 100644
--- a/batteryml/preprocess/preprocess_CALCE.py
+++ b/batteryml/preprocess/preprocess_CALCE.py
@@ -21,14 +21,22 @@
 
 @PREPROCESSORS.register()
 class CALCEPreprocessor(BasePreprocessor):
-    def process(self, parentdir) -> List[BatteryData]:
+    def process(self, parentdir, **kwargs) -> List[BatteryData]:
         path = Path(parentdir)
         raw_files = [Path(f) for f in path.glob('*.zip')]
         cells = [f.stem for f in raw_files]
         if not self.silent:
             cells = tqdm(cells)
-        batteries = []
+
+        process_batteries_num = 0
+        skip_batteries_num = 0
         for cell, raw_file in zip(cells, raw_files):
+            # judge whether to skip the processed file
+            whether_to_skip = self.check_processed_file(f'CALCE_{cell}')
+            if whether_to_skip == True:
+                skip_batteries_num += 1
+                continue
+
             rawdatadir = raw_file.parent / cell
             if not rawdatadir.exists():
                 if not self.silent:
@@ -95,7 +103,7 @@ def process(self, parentdir) -> List[BatteryData]:
             if 'CX2_16' == cell.upper():
                 clean_cycles = clean_cycles[1:]
 
-            batteries.append(BatteryData(
+            battery = BatteryData(
                 cell_id=f'CALCE_{cell}',
                 form_factor='prismatic',
                 anode_material='graphite',
@@ -104,12 +112,17 @@ def process(self, parentdir) -> List[BatteryData]:
                 nominal_capacity_in_Ah=C,
                 max_voltage_limit_in_V=4.2,
                 min_voltage_limit_in_V=2.7
-            ))
+            )
+            self.dump_single_file(battery)
+            process_batteries_num += 1
+
+            if not self.silent:
+                tqdm.write(f'File: {battery.cell_id} dumped to pkl file')
 
             # Remove the inflated directory
             shutil.rmtree(rawdatadir)
 
-        return batteries
+        return process_batteries_num, skip_batteries_num
 
 
 @njit
diff --git a/batteryml/preprocess/preprocess_HNEI.py b/batteryml/preprocess/preprocess_HNEI.py
index dbed1e2..06be062 100644
--- a/batteryml/preprocess/preprocess_HNEI.py
+++ b/batteryml/preprocess/preprocess_HNEI.py
@@ -15,14 +15,21 @@
 
 @PREPROCESSORS.register()
 class HNEIPreprocessor(BasePreprocessor):
-    def process(self, parent_dir) -> List[BatteryData]:
+    def process(self, parent_dir, **kwargs) -> List[BatteryData]:
         path = Path(parent_dir)
         cells = set(
             x.stem.split('_timeseries')[0]
             for x in path.glob('*HNEI*timeseries*'))
 
-        batteries = []
+        process_batteries_num = 0
+        skip_batteries_num = 0
         for cell in tqdm(cells, desc='Processing HNEI cells'):
+            # judge whether to skip the processed file
+            whether_to_skip = self.check_processed_file(cell)
+            if whether_to_skip == True:
+                skip_batteries_num += 1
+                continue
+
             timeseries_file = next(path.glob(f'*{cell}*timeseries*'))
             cycle_data_file = next(path.glob(f'*{cell}*cycle_data*'))
             timeseries_df = pd.read_csv(timeseries_file)
@@ -32,8 +39,14 @@ def process(self, parent_dir) -> List[BatteryData]:
             timeseries_df, _ = clean_cell(
                 timeseries_df, cycle_data_df, shifts=18)
             # Capacity is stated here: (https://www.mdpi.com/1996-1073/11/5/1031)
-            batteries.append(organize_cell(timeseries_df, cell, 2.8))
-        return batteries
+            battery = organize_cell(timeseries_df, cell, 2.8)
+            self.dump_single_file(battery)
+            process_batteries_num += 1
+
+            if not self.silent:
+                tqdm.write(f'File: {battery.cell_id} dumped to pkl file')
+
+        return process_batteries_num, skip_batteries_num
 
 
 def organize_cell(timeseries_df, name, C):
diff --git a/batteryml/preprocess/preprocess_HUST.py b/batteryml/preprocess/preprocess_HUST.py
index bfe9154..3c6176a 100644
--- a/batteryml/preprocess/preprocess_HUST.py
+++ b/batteryml/preprocess/preprocess_HUST.py
@@ -18,7 +18,7 @@
 
 @PREPROCESSORS.register()
 class HUSTPreprocessor(BasePreprocessor):
-    def process(self, parentdir) -> List[BatteryData]:
+    def process(self, parentdir, **kwargs) -> List[BatteryData]:
         raw_file = Path(parentdir) / 'hust_data.zip'
 
         with zipfile.ZipFile(raw_file, 'r') as zip_ref:
@@ -35,22 +35,31 @@ def process(self, parentdir) -> List[BatteryData]:
         if not self.silent:
             cell_files = tqdm(
                 cell_files, desc='Processing HUST cells')
-        batteries = []
+
+        process_batteries_num = 0
+        skip_batteries_num = 0
         for cell_file in cell_files:
             cell_id = cell_file.stem
             cell_name = f'HUST_{cell_id}'
+
+            # judge whether to skip the processed file
+            whether_to_skip = self.check_processed_file(cell_name)
+            if whether_to_skip == True:
+                skip_batteries_num += 1
+                continue
+
             with open(cell_file, 'rb') as fin:
                 cell_data = pickle.load(fin)[cell_id]['data']
             cycles = []
             for cycle in range(len(cell_data)):
-                df = cell_data[cycle+1]
+                df = cell_data[cycle + 1]
                 I = df['Current (mA)'].values / 1000.  # noqa
                 t = df['Time (s)'].values
                 V = df['Voltage (V)'].values
                 Qd = calc_Q(I, t, is_charge=False)
                 Qc = calc_Q(I, t, is_charge=True)
                 cycles.append(CycleData(
-                    cycle_number=cycle+1,
+                    cycle_number=cycle + 1,
                     voltage_in_V=V.tolist(),
                     current_in_A=I.tolist(),
                     time_in_s=t.tolist(),
@@ -62,7 +71,7 @@ def process(self, parentdir) -> List[BatteryData]:
             # Skip first problematic cycles
             if cell_name == 'HUST_7-5':
                 cycles = cycles[2:]
-            batteries.append(BatteryData(
+            battery = BatteryData(
                 cell_id=cell_name,
                 cycle_data=cycles,
                 form_factor='cylindrical_18650',
@@ -103,12 +112,17 @@ def process(self, parentdir) -> List[BatteryData]:
                 ],
                 min_voltage_limit_in_V=2.0,
                 max_voltage_limit_in_V=3.6
-            ))
+            )
+            self.dump_single_file(battery)
+            process_batteries_num += 1
+
+            if not self.silent:
+                tqdm.write(f'File: {battery.cell_id} dumped to pkl file')
 
         # Remove the inflated data
         shutil.rmtree(datadir)
 
-        return batteries
+        return process_batteries_num, skip_batteries_num
 
 
 # See https://www.rsc.org/suppdata/d2/ee/d2ee01676a/d2ee01676a1.pdf
diff --git a/batteryml/preprocess/preprocess_MATR.py b/batteryml/preprocess/preprocess_MATR.py
index bb094a8..d386c15 100644
--- a/batteryml/preprocess/preprocess_MATR.py
+++ b/batteryml/preprocess/preprocess_MATR.py
@@ -15,17 +15,18 @@
 
 @PREPROCESSORS.register()
 class MATRPreprocessor(BasePreprocessor):
-    def process(self, parentdir) -> List[BatteryData]:
+    def process(self, parentdir, **kwargs) -> List[BatteryData]:
         raw_files = [
-            parentdir /  'MATR_batch_20170512.mat',
-            parentdir /  'MATR_batch_20170630.mat',
-            parentdir /  'MATR_batch_20180412.mat',
-            parentdir /  'MATR_batch_20190124.mat',
+            parentdir / 'MATR_batch_20170512.mat',
+            parentdir / 'MATR_batch_20170630.mat',
+            parentdir / 'MATR_batch_20180412.mat',
+            parentdir / 'MATR_batch_20190124.mat',
         ]
 
         data_batches = []
         if not self.silent:
             raw_files = tqdm(raw_files)
+
         for indx, f in enumerate(raw_files):
             if hasattr(raw_files, 'set_description'):
                 raw_files.set_description(f'Loading {f.stem}')
@@ -35,7 +36,10 @@ def process(self, parentdir) -> List[BatteryData]:
 
             data_batches.append(load_batch(f, indx+1))
 
-        return clean_batches(data_batches)
+        batteries_num = clean_batches(
+            data_batches, self.dump_single_file, self.silent)
+
+        return batteries_num
 
 
 def load_batch(file, k):
@@ -100,13 +104,14 @@ def load_batch(file, k):
             'cycle_life': cl,
             'charge_policy': policy,
             'summary': summary,
-            'cycles': cycle_dict}
+            'cycles': cycle_dict
+        }
         key = f'b{k}c' + str(i)
         bat_dict[key] = cell_dict
     return bat_dict
 
 
-def clean_batches(data_batches):
+def clean_batches(data_batches, dump_single_file, silent):
     # remove batteries that do not reach 80% capacity
     # del data_batches[0]['b1c8']
     # del data_batches[0]['b1c10']
@@ -142,12 +147,18 @@ def clean_batches(data_batches):
             data_batches[0][bk]['cycles'][str(last_cycle + j)] = \
                 data_batches[1][batch2_keys[i]]['cycles'][jk]
 
-    cleaned = [
-        organize_cell(batch[cell], cell)
-        for batch in data_batches for cell in batch
-        if cell not in batch2_keys
-    ]
-    return cleaned
+    process_batteries_num = 0
+    skip_batteries_num = 0
+    for batch in data_batches:
+        for cell in batch:
+            if cell not in batch2_keys:
+                battery = organize_cell(batch[cell], cell)
+                dump_single_file(battery)
+                if not silent:
+                    tqdm.write(f'File: {battery.cell_id} dumped to pkl file')
+                process_batteries_num += 1
+
+    return process_batteries_num, skip_batteries_num
 
 
 def organize_cell(data, name):
diff --git a/batteryml/preprocess/preprocess_OX.py b/batteryml/preprocess/preprocess_OX.py
index da7d58b..fdbc319 100644
--- a/batteryml/preprocess/preprocess_OX.py
+++ b/batteryml/preprocess/preprocess_OX.py
@@ -14,21 +14,35 @@
 
 @PREPROCESSORS.register()
 class OXPreprocessor(BasePreprocessor):
-    def process(self, parentdir) -> List[BatteryData]:
+    def process(self, parentdir, **kwargs) -> List[BatteryData]:
         path = Path(parentdir)
         cells = set(
             x.stem.split('_timeseries')[0]
             for x in path.glob('*timeseries*'))
-        batteries = []
+
+        process_batteries_num = 0
+        skip_batteries_num = 0
         for cell in tqdm(cells, desc='Processing OX cells'):
+            # judge whether to skip the processed file
+            whether_to_skip = self.check_processed_file(cell)
+            if whether_to_skip == True:
+                skip_batteries_num += 1
+                continue
+
             timeseries_file = next(path.glob(f'*{cell}*timeseries*'))
             timeseries_df = pd.read_csv(timeseries_file)
             # Nominal capacity is 740mAh, which leads to too short
             # cycle life. No batteries reach 0.74Ah, so we use 0.72Ah
             # to calculate the cycle life.
             # https://ora.ox.ac.uk/objects/uuid:03ba4b01-cfed-46d3-9b1a-7d4a7bdf6fac
-            batteries.append(organize_cell(timeseries_df, cell, 0.72))
-        return batteries
+            battery = organize_cell(timeseries_df, cell, 0.72)
+            self.dump_single_file(battery)
+            process_batteries_num += 1
+
+            if not self.silent:
+                tqdm.write(f'File: {battery.cell_id} dumped to pkl file')
+
+        return process_batteries_num, skip_batteries_num
 
 
 def organize_cell(timeseries_df, name, C):
diff --git a/batteryml/preprocess/preprocess_RWTH.py b/batteryml/preprocess/preprocess_RWTH.py
index c654c79..21b5f72 100644
--- a/batteryml/preprocess/preprocess_RWTH.py
+++ b/batteryml/preprocess/preprocess_RWTH.py
@@ -18,7 +18,7 @@
 
 @PREPROCESSORS.register()
 class RWTHPreprocessor(BasePreprocessor):
-    def process(self, parentdir) -> List[BatteryData]:
+    def process(self, parentdir, **kwargs) -> List[BatteryData]:
         raw_file = Path(parentdir) / 'RWTH.zip'
 
         # Unzip the file first
@@ -54,9 +54,18 @@ def process(self, parentdir) -> List[BatteryData]:
         cells = [f'{i:03}' for i in range(2, 50)]
         if not self.silent:
             cells = tqdm(cells)
-        batteries = []
+
+        process_batteries_num = 0
+        skip_batteries_num = 0
         for cell in cells:
             name = f'RWTH_{cell}'
+
+            # judge whether to skip the processed file
+            whether_to_skip = self.check_processed_file(name)
+            if whether_to_skip == True:
+                skip_batteries_num += 1
+                continue
+
             if not self.silent:
                 cells.set_description(f'Processing csv files for cell {name}')
             files = datadir.glob(f'*{cell}=ZYK*Zyk*.csv')
@@ -90,13 +99,14 @@ def process(self, parentdir) -> List[BatteryData]:
             # Remove abnormal cycles
             Qds = np.array([max(x.discharge_capacity_in_Ah) for x in cycles])
             to_remove = remove_abnormal_cycle(Qds)
-            cycles = [cycle for i, cycle in enumerate(cycles) if not to_remove[i]]
+            cycles = [cycle for i, cycle in enumerate(
+                cycles) if not to_remove[i]]
             # Organize cell
             # The nominal capacity is 2.05Ah, but due to quality issue,
             # approximately 1.85Ah each. Cycling between 20% to 80% SoC
             # makes its nominal capacity 1.85 * 0.6 = 1.11 Ah.
             # See https://publications.rwth-aachen.de/record/818642/files/Content_RWTH-2021-04545.pdf  # noqa
-            batteries.append(BatteryData(
+            battery = BatteryData(
                 cell_id=name,
                 cycle_data=cycles,
                 form_factor='cylindrical_18650',
@@ -126,12 +136,18 @@ def process(self, parentdir) -> List[BatteryData]:
                 min_voltage_limit_in_V=3.5,
                 max_voltage_limit_in_V=3.9,
                 max_current_limit_in_A=4
-            ))
+            )
+
+            self.dump_single_file(battery)
+            process_batteries_num += 1
+
+            if not self.silent:
+                tqdm.write(f'File: {battery.cell_id} dumped to pkl file')
 
         # Remove the extracted files
         shutil.rmtree(subdir)
 
-        return batteries
+        return process_batteries_num, skip_batteries_num
 
 
 @njit
diff --git a/batteryml/preprocess/preprocess_SNL.py b/batteryml/preprocess/preprocess_SNL.py
index 02d9939..0d155fd 100644
--- a/batteryml/preprocess/preprocess_SNL.py
+++ b/batteryml/preprocess/preprocess_SNL.py
@@ -15,7 +15,7 @@
 
 @PREPROCESSORS.register()
 class SNLPreprocessor(BasePreprocessor):
-    def process(self, parentdir) -> List[BatteryData]:
+    def process(self, parentdir, **kwargs) -> List[BatteryData]:
         path = Path(parentdir)
         cells = set(
             x.stem.split('_timeseries')[0]
@@ -47,8 +47,17 @@ def process(self, parentdir) -> List[BatteryData]:
             'SNL_18650_NCA_25C_40-60_0.5-0.5C_b',
             'SNL_18650_NMC_25C_20-80_0.5-3C_b']
         cells = tuple(cell for cell in cells if cell not in to_drop)
-        batteries = []
+
+        process_batteries_num = 0
+        skip_batteries_num = 0
         for cell in tqdm(cells, desc='Processing SNL cells'):
+
+            # judge whether to skip the processed file
+            whether_to_skip = self.check_processed_file(cell)
+            if whether_to_skip == True:
+                skip_batteries_num += 1
+                continue
+
             timeseries_file = next(path.glob(f'*{cell}*timeseries*'))
             cycle_data_file = next(path.glob(f'*{cell}*cycle_data*'))
             timeseries_df = pd.read_csv(timeseries_file)
@@ -59,8 +68,15 @@ def process(self, parentdir) -> List[BatteryData]:
                 se = False
             timeseries_df, cycle_data_df = clean_snl_cell(
                 timeseries_df, cycle_data_df, should_exclude=se)
-            batteries.append(organize_cell(timeseries_df, cell))
-        return batteries
+
+            battery = organize_cell(timeseries_df, cell)
+            self.dump_single_file(battery)
+            process_batteries_num += 1
+
+            if not self.silent:
+                tqdm.write(f'File: {battery.cell_id} dumped to pkl file')
+
+        return process_batteries_num, skip_batteries_num
 
 
 def get_capacity(cell_name):
diff --git a/batteryml/preprocess/preprocess_UL_PUR.py b/batteryml/preprocess/preprocess_UL_PUR.py
index 5f768ac..4800afe 100644
--- a/batteryml/preprocess/preprocess_UL_PUR.py
+++ b/batteryml/preprocess/preprocess_UL_PUR.py
@@ -15,14 +15,21 @@
 
 @PREPROCESSORS.register()
 class UL_PURPreprocessor(BasePreprocessor):
-    def process(self, parentdir: str) -> List[BatteryData]:
+    def process(self, parentdir: str, **kwargs) -> List[BatteryData]:
         path = Path(parentdir)
         cells = set(
             x.stem.split('_timeseries')[0]
             for x in path.glob('*UL-PUR_N*timeseries*'))
 
-        batteries = []
+        process_batteries_num = 0
+        skip_batteries_num = 0
         for cell in tqdm(cells, desc='Processing UL-PUR cells'):
+            # judge whether to skip the processed file
+            whether_to_skip = self.check_processed_file(cell)
+            if whether_to_skip == True:
+                skip_batteries_num += 1
+                continue
+
             timeseries_file = next(path.glob(f'*{cell}*timeseries*'))
             cycle_data_file = next(path.glob(f'*{cell}*cycle_data*'))
             timeseries_df = pd.read_csv(timeseries_file)
@@ -31,9 +38,16 @@ def process(self, parentdir: str) -> List[BatteryData]:
                 continue
             timeseries_df, _ = clean_cell(
                 timeseries_df, cycle_data_df, shifts=4)
-            batteries.append(organize_cell(
-                timeseries_df, cell, get_capacity(cell)))
-        return batteries
+
+            battery = organize_cell(
+                timeseries_df, cell, get_capacity(cell))
+            self.dump_single_file(battery)
+            process_batteries_num += 1
+
+            if not self.silent:
+                tqdm.write(f'File: {battery.cell_id} dumped to pkl file')
+
+        return process_batteries_num, skip_batteries_num
 
 
 def get_capacity(cell_name):
diff --git a/batteryml/preprocess/preprocess_arbin.py b/batteryml/preprocess/preprocess_arbin.py
new file mode 100644
index 0000000..51db291
--- /dev/null
+++ b/batteryml/preprocess/preprocess_arbin.py
@@ -0,0 +1,226 @@
+# Licensed under the MIT License.
+# Copyright (c) Microsoft Corporation.
+
+import os
+import logging
+import pandas as pd
+from tqdm import tqdm
+from typing import List
+from pathlib import Path
+
+from batteryml.builders import PREPROCESSORS
+from batteryml.utils import import_config
+from batteryml.preprocess.base import BasePreprocessor
+from batteryml import BatteryData, CycleData, CyclingProtocol
+
+
+@PREPROCESSORS.register()
+class ARBINPreprocessor(BasePreprocessor):  # Builds a BatteryData from each raw ARBIN file in a directory and dumps it.
+    def process(self, parentdir, config_path, **kwargs) -> List[BatteryData]:  # NOTE(review): returns a (processed, skipped) count tuple, not the annotated list.
+        if config_path is None or str(config_path) == "None":  # Also rejects the literal string "None".
+            raise ValueError("Config path is not specified.")
+        else:
+            CONFIG_FIELDS = ["column_names", "data_types"]  # Both keys are read later by organize_cell_file.
+            CONVERSION_CONFIG = import_config(Path(config_path), CONFIG_FIELDS)
+
+        cell_files = [f for f in Path(parentdir).iterdir(
+        ) if f.is_file() and not f.name.endswith('.yaml')]  # Every regular non-.yaml file directly in parentdir is treated as a cell file.
+
+        if not self.silent:
+            cell_files = tqdm(
+                cell_files, desc='Processing data from ARBIN cycler')
+
+        process_batteries_num = 0
+        skip_batteries_num = 0
+        for cell_file in cell_files:
+            whether_to_skip = self.check_processed_file(
+                "ARBIN_" + cell_file.stem)  # Same string as the cell_id assigned in organize_cell.
+            if whether_to_skip == True:
+                skip_batteries_num += 1
+                continue
+
+            logging.info(f'Processing cell_file: {cell_file.name}')
+
+            battery = organize_cell_file(cell_file, CONVERSION_CONFIG)
+            self.dump_single_file(battery)  # Each battery is dumped right after it is built; none are collected.
+            process_batteries_num += 1
+
+            if not self.silent:
+                logging.info(f'File: {battery.cell_id} dumped to pkl file')
+
+        return process_batteries_num, skip_batteries_num
+
+
+def organize_cell_file(cell_file, CONVERSION_CONFIG):  # Read one .csv/.xlsx/.xls file and build a BatteryData from it.
+    file_readers = {  # pandas reader function per supported file suffix.
+        '.csv': pd.read_csv,
+        '.xlsx': pd.read_excel,
+        '.xls': pd.read_excel
+    }
+    data = pd.DataFrame()
+    file_processed = False
+    cell_file_suffix = cell_file.suffix
+    try:
+        if cell_file_suffix in ['.xlsx', '.xls']:
+            sheets = {sheet_name: df for sheet_name, df in file_readers[cell_file_suffix](
+                cell_file, sheet_name=None, index_col=0).items() if sheet_name != "Info"}  # sheet_name=None loads all sheets as a dict; the "Info" sheet is dropped.
+            for sheet_df in sheets.values():
+                data = pd.concat([data, sheet_df])  # Stack the remaining sheets into one frame.
+            file_processed = True
+        elif cell_file_suffix == '.csv':
+            data = file_readers[cell_file_suffix](cell_file, index_col=0)
+            file_processed = True
+        if not file_processed:
+            raise ValueError(
+                f"Unsupported file format: {cell_file.suffix}. Please provide a .csv, .xlsx, or .xls file.")
+    except Exception as e:  # NOTE(review): this also catches the ValueError raised just above.
+        logging.error(f"Error processing file {cell_file}: {e}")  # Only logged; execution continues with whatever `data` holds.
+
+    columns = {
+        v: k for k, v in CONVERSION_CONFIG["column_names"].items() if v in data.columns}  # Inverted mapping (config value -> config key), limited to columns present in data.
+    data.rename(columns=columns, inplace=True)
+
+    data_types = {
+        k: v for k, v in CONVERSION_CONFIG["data_types"].items() if k in data.columns}  # Cast only columns that exist after renaming.
+    data = data.astype(data_types)
+
+    cycles = data_cycles(data)
+
+    metadata_file_path = cell_file.with_suffix('.metadata.yaml')  # Sibling file: the data file's suffix replaced by ".metadata.yaml".
+    metadata_file = metadata_file_path if os.path.exists(
+        metadata_file_path) else None  # None when the cell has no metadata file.
+    metadata = organize_metadata(metadata_file)
+
+    return organize_cell(cell_file.stem, cycles, metadata)
+
+
+def data_cycles(raw_data):  # Group raw_data rows by 'cycle_index' and return one dict of per-cycle values per group.
+    grouped_by_cycle_idx = raw_data.groupby('cycle_index')
+    columns_to_group_mapping = {  # raw_data column name -> key used in each per-cycle dict.
+        'step_index': 'step_index',
+        'current': 'I',
+        'voltage': 'V',
+        'charge_capacity': 'Qc',
+        'discharge_capacity': 'Qd',
+        'charge_energy': 'Ec',
+        'discharge_energy': 'Ed',
+        'temperature': 'T',
+        'internal_resistance': 'IR',
+        'test_time': 't',
+        'date_time_iso': 'date_time_iso',
+    }
+    grouped_data = {}
+    grouped_data['data_point'] = grouped_by_cycle_idx.apply(
+        lambda x: (x.index + 1 - x.index[0]).tolist()  # Index offset so each cycle's first row is 1; assumes a numeric index - TODO confirm.
+    )
+    for column in columns_to_group_mapping.keys():
+        if column in raw_data.columns:
+            try:
+                grouped_data[column] = grouped_by_cycle_idx[column].apply(list)
+            except Exception as e:
+                logging.warning(
+                    f'Failed to process column {column} to group: {e}')
+        else:
+            grouped_data[column] = grouped_by_cycle_idx.apply(
+                lambda x: [None]*len(x))  # Absent column: one None per row of the cycle.
+
+    cycle_dict = {}
+    all_cycles = set(range(max(grouped_by_cycle_idx.groups.keys()) + 1))  # Every integer from 0 up to the largest cycle_index.
+    existing_cycles = set(grouped_by_cycle_idx.groups.keys())
+
+    missing_cycles = all_cycles - existing_cycles
+    for missing_cycle in missing_cycles:
+        logging.warning(f"Data of cycle {missing_cycle} missed.")
+
+    for cdi, i in enumerate(grouped_by_cycle_idx.groups.keys()):  # cdi: enumeration position; i: actual cycle_index value.
+        cd = {}
+        try:
+            cd['data_point'] = grouped_data['data_point'][i]
+            for field in columns_to_group_mapping.keys():
+                if field == 'internal_resistance':
+                    #####################################################################
+                    # Assume the last IR of each cycle is representative of that cycle. #
+                    #####################################################################
+                    cd['IR'] = grouped_data[field][i][-1]
+                elif field == 'test_time':
+                    min_date_time = min(grouped_data[field][i])
+                    cd['t'] = [
+                        time - min_date_time for time in grouped_data[field][i]]  # Times become relative to the cycle's smallest test_time.
+                else:
+                    cd[columns_to_group_mapping[field]] = grouped_data[field][i]
+        except Exception as e:  # NOTE(review): the error is only logged; the partially filled cd is still stored below.
+            logging.warning(f"Error processing field '{field}' in cycle {i}")
+        cycle_dict[str(cdi)] = cd  # Keyed by the enumeration position as a string, not by cycle_index.
+
+    return cycle_dict
+
+
+# NOTE: this function may need adjusting to match the fields of your own metadata files.
+def organize_metadata(meta_path):  # Load cell metadata from meta_path; each field stays None when it cannot be loaded.
+    METADATA_CONFIG_FIELDS = ["form_factor", "anode_material", "cathode_material",
+                              "nominal_capacity_in_Ah",
+                              "min_voltage_limit_in_V", "max_voltage_limit_in_V",
+                              "charge_protocol", "discharge_protocol"]
+    METADATA_CONFIG = {field: None for field in METADATA_CONFIG_FIELDS}  # Defaults: every field is None.
+
+    try:
+        if meta_path is None or str(meta_path) == "None":
+            raise ValueError("Metadata config path is not specified.")
+        config = import_config(Path(meta_path), METADATA_CONFIG_FIELDS)
+        METADATA_CONFIG.update(config)
+    except (ValueError, FileNotFoundError) as e:  # Only logged; the None defaults are kept.
+        logging.error(e)
+
+    charge_protocols = [CyclingProtocol(
+        **cp) for cp in METADATA_CONFIG.get('charge_protocol', []) or []]  # "or []" covers a None value.
+    discharge_protocols = [CyclingProtocol(
+        **dp) for dp in METADATA_CONFIG.get('discharge_protocol', []) or []]
+
+    metadata = {
+        "form_factor": METADATA_CONFIG.get("form_factor"),
+        "anode_material": METADATA_CONFIG.get("anode_material"),
+        "cathode_material": METADATA_CONFIG.get("cathode_material"),
+        "charge_protocol": charge_protocols,
+        "discharge_protocol": discharge_protocols,
+        "nominal_capacity_in_Ah": METADATA_CONFIG.get("nominal_capacity_in_Ah"),
+        "min_voltage_limit_in_V": METADATA_CONFIG.get("min_voltage_limit_in_V"),
+        "max_voltage_limit_in_V": METADATA_CONFIG.get("max_voltage_limit_in_V")
+    }
+    return metadata
+
+
+def organize_cell(name, cycles, metadata):  # Build a BatteryData with cell_id "ARBIN_<name>" from per-cycle dicts and metadata.
+    cycle_data = []
+    for cycle_idx, cycle in cycles.items():  # NOTE(review): keys are strings when cycles comes from data_cycles, so cycle_number gets a str.
+        # Skip the first cycle if it is necessary
+        # if int(cycle_idx) == 0:
+        #     continue
+        cycle_data.append(CycleData(
+            cycle_number=cycle_idx,
+            voltage_in_V=cycle['V'],
+            current_in_A=cycle['I'],
+            charge_capacity_in_Ah=cycle['Qc'],
+            discharge_capacity_in_Ah=cycle['Qd'],
+            time_in_s=cycle['t'],
+            temperature_in_C=cycle['T'],
+            internal_resistance_in_ohm=cycle['IR'],
+
+            energy_charge=cycle['Ec'],
+            energy_discharge=cycle['Ed'],
+            step_index=cycle['step_index'],
+            data_point=cycle['data_point'],
+            date_time_iso=cycle['date_time_iso']
+        ))
+
+    return BatteryData(
+        cell_id=f'ARBIN_{name}',
+        cycle_data=cycle_data,
+        form_factor=metadata["form_factor"],
+        anode_material=metadata["anode_material"],
+        cathode_material=metadata["cathode_material"],
+        charge_protocol=metadata["charge_protocol"],
+        discharge_protocol=metadata["discharge_protocol"],
+        nominal_capacity_in_Ah=metadata["nominal_capacity_in_Ah"],
+        min_voltage_limit_in_V=metadata["min_voltage_limit_in_V"],
+        max_voltage_limit_in_V=metadata["max_voltage_limit_in_V"]
+    )
diff --git a/batteryml/preprocess/preprocess_neware.py b/batteryml/preprocess/preprocess_neware.py
new file mode 100644
index 0000000..1122a03
--- /dev/null
+++ b/batteryml/preprocess/preprocess_neware.py
@@ -0,0 +1,262 @@
+# Licensed under the MIT License.
+# Copyright (c) Microsoft Corporation.
+
+import os
+import logging
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+from typing import List
+from pathlib import Path
+
+from batteryml.builders import PREPROCESSORS
+from batteryml.utils import import_config
+from batteryml.preprocess.base import BasePreprocessor
+from batteryml import BatteryData, CycleData, CyclingProtocol
+
+
+@PREPROCESSORS.register()
+class NEWAREPreprocessor(BasePreprocessor):
+    def process(self, parentdir, config_path, **kwargs) -> tuple:
+        """Convert every NEWARE export directly under ``parentdir``.
+
+        Files ending in ``.yaml`` are ignored and cells for which
+        ``check_processed_file`` is truthy are skipped. Returns the pair
+        ``(processed, skipped)``; raises ``ValueError`` without a config.
+        """
+        if config_path is None or str(config_path) == "None":
+            raise ValueError("Config path is not specified.")
+        conversion_config = import_config(
+            Path(config_path), ["column_names", "data_types", "scales"])
+
+        cell_files = [f for f in Path(parentdir).iterdir()
+                      if f.is_file() and not f.name.endswith('.yaml')]
+        if not self.silent:
+            cell_files = tqdm(
+                cell_files, desc='Processing data from NEWARE cycler')
+
+        process_batteries_num = 0
+        skip_batteries_num = 0
+        for cell_file in cell_files:
+            if self.check_processed_file("NEWARE_" + cell_file.stem):
+                skip_batteries_num += 1
+                continue
+
+            logging.info(f'Processing cell_file: {cell_file.name}')
+            battery = organize_cell_file(cell_file, conversion_config)
+            self.dump_single_file(battery)
+            process_batteries_num += 1
+            if not self.silent:
+                logging.info(f'File: {battery.cell_id} dumped to pkl file')
+
+        return process_batteries_num, skip_batteries_num
+
+
+def organize_cell_file(cell_file, CONVERSION_CONFIG):
+    """Parse one NEWARE export file into a ``BatteryData``.
+
+    Rows starting with ``,,`` are records, rows starting with ``,"`` are
+    steps and every other row is a cycle row; each record is tagged with
+    the most recent cycle id, step id and step DCIR value.
+    """
+    ir_column_name = '"DCIR(O)"'
+
+    def to_seconds(stamp):
+        hours, minutes, seconds = stamp.split(":")[-3:]
+        return 3600 * float(hours) + 60 * float(minutes) + float(seconds)
+
+    record_data = []
+    with open(cell_file, encoding="ISO-8859-1") as handle:
+        cycle_header = handle.readline().replace("\t", "")
+        step_header = handle.readline().replace("\t", "")
+        ir_index = step_header.split(",").index(ir_column_name)
+        record_header = handle.readline().replace("\t", "").split(",")
+        record_header[0] = cycle_header.split(",")[0]
+        record_header[1] = step_header.split(",")[1]
+        record_header[22] = ir_column_name
+        record_header = ",".join(record_header)
+        record_header = record_header.encode("ascii", "ignore").decode()
+
+        cycle_number = 0
+        step_number = 0
+        ir_value = None
+        for line in handle:
+            fields = line.split(",")
+            if line[:2] == r',"':  # step data
+                step_number = fields[1]
+                ir_value = fields[ir_index]
+            elif line[:2] == r",,":  # record data
+                fields[0] = cycle_number
+                fields[1] = step_number
+                fields[22] = ir_value
+                record_data.append(fields)
+            else:  # cycle data
+                cycle_number = fields[0]
+
+    cleaned_columns = [col.replace('"', '')
+                       for col in record_header.split(",")]
+    record_df = pd.DataFrame(record_data, columns=cleaned_columns)
+    record_df = record_df.replace({'\t': '', '"': ''}, regex=True)
+
+    # Work on a copy so the assignments below never hit a view of record_df.
+    keep = ~record_df.columns.str.contains("Unnamed")
+    data = record_df.loc[:, keep].copy()
+    data["Time(h:min:s.ms)"] = data["Time(h:min:s.ms)"].apply(to_seconds)
+    # Deal with missing data in the internal resistance
+    data["DCIR(O)"] = data["DCIR(O)"].apply(lambda x: np.nan if x == "-" else x)
+
+    columns = {
+        v: k for k, v in CONVERSION_CONFIG["column_names"].items() if v in data.columns}
+    data = data.rename(columns=columns)
+    data_types = {
+        k: v for k, v in CONVERSION_CONFIG["data_types"].items() if k in data.columns}
+    data = data.astype(data_types)
+    for column, scale in CONVERSION_CONFIG["scales"].items():
+        if column in data.columns:
+            data[column] *= scale
+
+    data["internal_resistance"] = data["internal_resistance"].ffill().bfill()
+    data["test_time"] = (
+        data["step_time"].diff().fillna(0).apply(
+            lambda x: 0 if x < 0 else x).cumsum()
+    )
+
+    cycles = data_cycles(data)
+    metadata_file_path = cell_file.with_suffix('.metadata.yaml')
+    metadata_file = metadata_file_path if os.path.exists(
+        metadata_file_path) else None
+    metadata = organize_metadata(metadata_file)
+    return organize_cell(cell_file.stem, cycles, metadata)
+
+
+def data_cycles(raw_data):
+    """Split the record table into one dict of series per cycle.
+
+    Keys are cycle positions as strings; absent columns become ``None`` lists.
+    """
+    grouped_by_cycle_idx = raw_data.groupby('cycle_index')
+    columns_to_group_mapping = {
+        'data_point': 'data_point',
+        'step_index': 'step_index',
+        'current': 'I',
+        'voltage': 'V',
+        'charge_capacity': 'Qc',
+        'discharge_capacity': 'Qd',
+        'charge_energy': 'Ec',
+        'discharge_energy': 'Ed',
+        'temperature': 'T',
+        'internal_resistance': 'IR',
+        'test_time': 't',
+        'date_time': 'date_time_iso',
+    }
+    grouped_data = {}
+    grouped_data['data_point'] = grouped_by_cycle_idx.apply(
+        lambda x: (x.index + 1 - x.index[0]).tolist()
+    )
+    for column in columns_to_group_mapping:
+        if column in raw_data.columns:
+            try:
+                grouped_data[column] = grouped_by_cycle_idx[column].apply(list)
+            except Exception as e:
+                logging.warning(
+                    f'Failed to process column {column} to group: {e}')
+        else:
+            grouped_data[column] = grouped_by_cycle_idx.apply(
+                lambda x: [None]*len(x))
+
+    existing_cycles = set(grouped_by_cycle_idx.groups.keys())
+    all_cycles = set(range(max(existing_cycles) + 1))
+    for missing_cycle in sorted(all_cycles - existing_cycles):
+        logging.warning(f"Data of cycle {missing_cycle} missed.")
+    cycle_dict = {}
+    for cdi, i in enumerate(grouped_by_cycle_idx.groups.keys()):
+        cd = {}
+        field = 'data_point'  # pre-bound so the handler can always name it
+        try:
+            cd['data_point'] = grouped_data['data_point'][i]
+            for field in columns_to_group_mapping:
+                values = grouped_data[field][i]
+                if field == 'internal_resistance':
+                    # Assume the last IR of each cycle is representative.
+                    cd['IR'] = values[-1]
+                elif field == 'test_time':
+                    start = min(values)
+                    cd['t'] = [time - start for time in values]
+                else:
+                    cd[columns_to_group_mapping[field]] = values
+        except Exception as e:
+            logging.warning(
+                f"Error processing field '{field}' in cycle {i}: {e}")
+        cycle_dict[str(cdi)] = cd
+
+    return cycle_dict
+
+
+# Need adjusting to custom metadata
+def organize_metadata(meta_path):
+    """Read cell metadata from ``meta_path``; unset fields stay ``None``.
+
+    A ``ValueError``/``FileNotFoundError`` while loading is only logged.
+    """
+    protocol_fields = ("charge_protocol", "discharge_protocol")
+    wanted = ["form_factor", "anode_material", "cathode_material",
+              "nominal_capacity_in_Ah",
+              "min_voltage_limit_in_V", "max_voltage_limit_in_V",
+              *protocol_fields]
+    loaded = dict.fromkeys(wanted)
+
+    try:
+        if meta_path is None or str(meta_path) == "None":
+            raise ValueError("Metadata config path is not specified.")
+        loaded.update(import_config(Path(meta_path), wanted))
+    except (ValueError, FileNotFoundError) as e:
+        logging.error(e)
+
+    output_order = (
+        "form_factor", "anode_material", "cathode_material",
+        *protocol_fields, "nominal_capacity_in_Ah",
+        "min_voltage_limit_in_V", "max_voltage_limit_in_V",
+    )
+    metadata = {}
+    for key in output_order:
+        value = loaded.get(key)
+        if key in protocol_fields:
+            value = [CyclingProtocol(**spec) for spec in value or []]
+        metadata[key] = value
+    return metadata
+
+
+def organize_cell(name, cycles, metadata):
+    """Pack one NEWARE cell into a ``BatteryData``.
+
+    Each ``cycles`` entry becomes one ``CycleData``, in iteration order.
+    """
+    def as_cycle(number, series):
+        return CycleData(
+            cycle_number=number,
+            voltage_in_V=series['V'],
+            current_in_A=series['I'],
+            charge_capacity_in_Ah=series['Qc'],
+            discharge_capacity_in_Ah=series['Qd'],
+            time_in_s=series['t'],
+            temperature_in_C=series['T'],
+            internal_resistance_in_ohm=series['IR'],
+            energy_charge=series['Ec'],
+            energy_discharge=series['Ed'],
+            step_index=series['step_index'],
+            data_point=series['data_point']
+        )
+
+    cycle_data = [as_cycle(key, value) for key, value in cycles.items()]
+    return BatteryData(
+        cell_id=f'NEWARE_{name}',
+        cycle_data=cycle_data,
+        form_factor=metadata["form_factor"],
+        anode_material=metadata["anode_material"],
+        cathode_material=metadata["cathode_material"],
+        charge_protocol=metadata["charge_protocol"],
+        discharge_protocol=metadata["discharge_protocol"],
+        nominal_capacity_in_Ah=metadata["nominal_capacity_in_Ah"],
+        min_voltage_limit_in_V=metadata["min_voltage_limit_in_V"],
+        max_voltage_limit_in_V=metadata["max_voltage_limit_in_V"]
+    )
diff --git a/batteryml/utils/config.py b/batteryml/utils/config.py
index b4bdb5b..900c2e4 100644
--- a/batteryml/utils/config.py
+++ b/batteryml/utils/config.py
@@ -6,6 +6,7 @@
 import os
 from addict import Dict
 
+
 def import_config(path: str, attr: list):
     """_summary_
 
@@ -27,6 +28,18 @@ def import_config(path: str, attr: list):
 
     if not isinstance(attr, list):
         attr = [attr]
+
+    # Check if expected attributes are present in the config file
+    missing_fields = [field for field in attr if field not in config]
+    if missing_fields:
+        raise ValueError(f"Missing expected config fields: {missing_fields}")
+
+    for field in attr:
+        value = config[field]
+        # Check if the value is not None or empty
+        if value is None or (isinstance(value, (str, list)) and not value):
+            raise ValueError(f"Invalid value for config field: {field}")
+
     return {
         field: getattr(config, field) if hasattr(config, field) else {}
         for field in attr
@@ -36,6 +49,7 @@ def import_config(path: str, attr: list):
 class YamlHandler:
     """handle yaml file
     """
+
     def __init__(self, file_path):
         """ YamlHandler init
         Parameters
@@ -44,7 +58,7 @@ def __init__(self, file_path):
             yaml file path of config
         """
         if not os.path.exists(file_path):
-            return FileExistsError(OSError)
+            raise FileNotFoundError(f"The file {file_path} does not exist.")
         self.file_path = file_path
 
     def read_yaml(self, encoding='utf-8'):
diff --git a/bin/batteryml.py b/bin/batteryml.py
index ead542c..2f8ae47 100755
--- a/bin/batteryml.py
+++ b/bin/batteryml.py
@@ -21,10 +21,10 @@ def main():
 
     # download command
     download_parser = subparsers.add_parser(
-        "download", help="Download raw files for public datasets")  
+        "download", help="Download raw files for public datasets")
     download_parser.add_argument(
         "dataset", choices=list(DOWNLOAD_LINKS.keys()),
-        help="Public dataset to download")  
+        help="Public dataset to download")
     download_parser.add_argument(
         "output_dir", help="Directory to save the raw data files")
     download_parser.set_defaults(func=download)
@@ -34,11 +34,14 @@ def main():
         "preprocess",
         help="Organize the raw data files into BatteryData and save to disk")
     preprocess_parser.add_argument(
-        "input_type", choices=SUPPORTED_SOURCES,
+        "input_type", choices=[value for values in SUPPORTED_SOURCES.values() for value in values],
         help="Type of input raw files. For public datasets, specific "
              "preprocessor will be called. For standard battery test "
              "output files, the corresponding preprocessing logic "
              "will be applied.")
+    preprocess_parser.add_argument(
+        "--config", default="None",
+        help="Path to the config file of Cycler.")
     preprocess_parser.add_argument(
         "raw_dir", help="Directory of raw input files.")
     preprocess_parser.add_argument(
@@ -94,22 +97,24 @@ def download(args):
 
 
 def preprocess(args):
-    assert os.path.exists(args.raw_dir), f'Input path not exist: {args.raw_dir}'
+    assert os.path.exists(
+        args.raw_dir), f'Input path not exist: {args.raw_dir}'
     if not os.path.exists(args.output_dir):
         os.makedirs(args.output_dir)
 
+    config_path = Path(args.config)
     input_path, output_path = Path(args.raw_dir), Path(args.output_dir)
     processor = PREPROCESSORS.build(dict(
         name=f'{args.input_type}Preprocessor',
         output_dir=output_path,
         silent=args.silent
     ))
-    processor(input_path)
+    processor(input_path, config_path=config_path)
 
 
 def run(args):
-    # Convert skip_if_executed to boolean  
-    args.skip_if_executed = args.skip_if_executed.lower() in ['true', '1', 'yes']  
+    # Convert skip_if_executed to boolean
+    args.skip_if_executed = args.skip_if_executed.lower() in ['true', '1', 'yes']
     pipeline = Pipeline(args.config, args.workspace)
     model, dataset = None, None  # Reuse to save setup cost
     if args.train:
diff --git a/configs/cyclers/arbin.yaml b/configs/cyclers/arbin.yaml
new file mode 100644
index 0000000..4ff15c2
--- /dev/null
+++ b/configs/cyclers/arbin.yaml
@@ -0,0 +1,37 @@
+column_names:
+  # columns names of raw_data:
+  # ['Test_Time', 'DateTime', 'Step_Time', 'Step_Index', 'Cycle_Index', 'Current', 'Voltage', 'Charge_Capacity', 'Discharge_Capacity', 'Charge_Energy', 'Discharge_Energy', 'dV/dt', 'Internal_Resistance', 'Temperature']
+  test_time: 'Test_Time'
+  date_time: 'DateTime'
+  step_time: 'Step_Time'
+  step_index: 'Step_Index'
+  cycle_index: 'Cycle_Index'
+  current: 'Current'
+  voltage: 'Voltage'
+  charge_capacity: 'Charge_Capacity'
+  discharge_capacity: 'Discharge_Capacity'
+  charge_energy: 'Charge_Energy'
+  discharge_energy: 'Discharge_Energy'
+  _dv/dt: 'dV/dt'
+  internal_resistance: 'Internal_Resistance'
+  temperature: 'Temperature'
+
+  # none:
+  data_point: 'Data_Point'
+    
+data_types:
+  data_point: 'int32'
+  test_time: 'float64'
+  date_time: 'float32'
+  step_time: 'float32'
+  step_index: 'int16'
+  cycle_index: 'int32'
+  current: 'float32'
+  voltage: 'float32'
+  charge_capacity: 'float64'
+  discharge_capacity: 'float64'
+  charge_energy: 'float64'
+  discharge_energy: 'float64'
+  dv/dt: 'float32'
+  internal_resistance: 'float32'
+  temperature: 'float32'
\ No newline at end of file
diff --git a/configs/cyclers/arbin_metadata.yaml b/configs/cyclers/arbin_metadata.yaml
new file mode 100644
index 0000000..1a63398
--- /dev/null
+++ b/configs/cyclers/arbin_metadata.yaml
@@ -0,0 +1,42 @@
+form_factor: 1
+anode_material: 2
+cathode_material: 3
+nominal_capacity_in_Ah: None  # NOTE: YAML reads bare None as the string "None", not null
+min_voltage_limit_in_V: None  # NOTE: string "None", not null - replace with the real limit
+max_voltage_limit_in_V: None  # NOTE: string "None", not null - replace with the real limit
+
+charge_protocol:
+  - rate_in_C: 0.5
+    current_in_A: 1.2
+    voltage_in_V: 3.7
+    power_in_W: null
+    start_voltage_in_V: null
+    start_soc: 0.0
+    end_voltage_in_V: 4.2
+    end_soc: 1.0
+  - rate_in_C: 1.0
+    current_in_A: 2.0
+    voltage_in_V: 3.8
+    power_in_W: null
+    start_voltage_in_V: null
+    start_soc: 0.0
+    end_voltage_in_V: 4.1
+    end_soc: 1.0
+
+discharge_protocol:
+  - rate_in_C: 0.5
+    current_in_A: 1.0
+    voltage_in_V: 3.2
+    power_in_W: null
+    start_voltage_in_V: null
+    start_soc: 1.0
+    end_voltage_in_V: 2.5
+    end_soc: 0.0
+  - rate_in_C: 0.8
+    current_in_A: 1.5
+    voltage_in_V: 3.3
+    power_in_W: null
+    start_voltage_in_V: null
+    start_soc: 1.0
+    end_voltage_in_V: 2.7
+    end_soc: 0.0
diff --git a/configs/cyclers/neware.yaml b/configs/cyclers/neware.yaml
new file mode 100644
index 0000000..ccf76d0
--- /dev/null
+++ b/configs/cyclers/neware.yaml
@@ -0,0 +1,42 @@
+column_names:
+  # columns names of raw_data:
+  # ['Cycle ID', 'Step ID', 'Record ID', 'Time(h:min:s.ms)', 'Voltage(V)', 'Current(mA)', 'Temperature(C)', 'Capacity(mAh)', 'Capacity Density(mAh/g)', 'Energy(mWh)', 'CmpEng(mWh/g)', 'Realtime', 'Min-T(C)', 'Max-T(C)', 'Avg-T(C)', 'Power(mW)', 'Capacitance_Chg(mAh)', 'Capacitance_DChg(mAh)', 'Engy_Chg(mWh)', 'Engy_DChg(mWh)', 'dQ/dV(mAh/V)', 'dQm/dV(mAh/V.g)', 'DCIR(O)']
+  cycle_index: 'Cycle ID'
+  step_index: 'Step ID'
+  voltage: 'Voltage(V)'
+  current: 'Current(mA)'
+  discharge_capacity: 'Capacitance_DChg(mAh)'
+  charge_capacity: 'Capacitance_Chg(mAh)'
+  charge_energy: 'Engy_Chg(mWh)'
+  discharge_energy: 'Engy_DChg(mWh)'
+  date_time: 'Realtime'
+  data_point: 'Record ID'
+  step_time: 'Time(h:min:s.ms)'
+  temperature: 'Temperature(C)'
+  internal_resistance: 'DCIR(O)'
+
+data_types:
+  data_point: 'int32'
+  test_time: 'float64'
+  step_time: 'float32'
+  step_index: 'int16'
+  cycle_index: 'int32'
+  current: 'float32'
+  voltage: 'float32'
+  discharge_capacity: 'float64'
+  charge_capacity: 'float64'
+  charge_energy: 'float64'
+  discharge_energy: 'float64'
+  internal_resistance: 'float32'
+  temperature: 'float32'
+
+# for the conversion of variable units:
+# example:
+# time: ms -> s
+# time: 0.001
+scales:
+  current: 0.001
+  charge_capacity: 0.001
+  discharge_capacity: 0.001
+  charge_energy: 0.001
+  discharge_energy: 0.001

From c2131f8f2c1eca8af1500188fe34aa985e33818c Mon Sep 17 00:00:00 2001
From: GCR-1178 <jiaweichen.china@gmail.com>
Date: Wed, 3 Jul 2024 09:35:44 +0000
Subject: [PATCH 2/2] update dependencies

---
 batteryml/preprocess/download.py | 6 +++---
 requirements.txt                 | 2 ++
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/batteryml/preprocess/download.py b/batteryml/preprocess/download.py
index 709c4fc..65db334 100644
--- a/batteryml/preprocess/download.py
+++ b/batteryml/preprocess/download.py
@@ -69,7 +69,7 @@ def memory2str(mem):
         print(f'[INFO] {filename} already exists. Skip it.')
         return
     with open(filename, 'wb') as f:
-        response = requests.get(url, stream=True)
+        response = requests.get(url, stream=True, verify=False)  # NOTE(review): verify=False turns off TLS certificate verification
         if total_length is None:
             total_length = response.headers.get('content-length')
         if response.status_code != 200:
@@ -82,8 +82,8 @@ def memory2str(mem):
             total_size = memory2str(total_length)
             bar_format = (
                 f'Downloading {filename}'
-                 '|{percentage:3.0f}%|{bar:20}|{desc}'
-                 '[{elapsed}<{remaining}{postfix}]')
+                '|{percentage:3.0f}%|{bar:20}|{desc}'
+                '[{elapsed}<{remaining}{postfix}]')
             if update_interval * chunk_size * 100 >= total_length:
                 update_interval = 1
             with tqdm(total=total_length, bar_format=bar_format) as bar:
diff --git a/requirements.txt b/requirements.txt
index 865be1c..9172e8c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,3 +10,5 @@ numba
 matplotlib
 h5py
 openpyxl
+requests
+xgboost
\ No newline at end of file