diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 9511644..66e4715 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -37,7 +37,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install package and dependencies - run: pip install rundoc . + run: pip install invoke rundoc . - name: make test-readme run: make test-readme @@ -46,7 +46,7 @@ jobs: strategy: matrix: python-version: [3.7, 3.8] - os: [ubuntu-latest, macos-latest] + os: [ubuntu-latest, macos-latest, windows-latest] steps: - uses: actions/checkout@v1 - name: Set up Python ${{ matrix.python-version }} @@ -54,7 +54,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install package and dependencies - run: pip install .[test] + run: pip install invoke .[test] - name: make test-unit run: make test-unit @@ -63,7 +63,7 @@ jobs: strategy: matrix: python-version: [3.7, 3.8] - os: [ubuntu-latest, macos-latest] + os: [ubuntu-latest, macos-latest, windows-latest] steps: - uses: actions/checkout@v1 - name: Set up Python ${{ matrix.python-version }} @@ -71,6 +71,6 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install package and dependencies - run: pip install .[test] + run: pip install invoke .[test] - name: make test-tutorials run: make test-tutorials \ No newline at end of file diff --git a/Makefile b/Makefile index 89530a1..d81ae6d 100644 --- a/Makefile +++ b/Makefile @@ -84,8 +84,7 @@ install-develop: clean-build clean-pyc ## install the package in editable mode a .PHONY: lint lint: ## check style with flake8 and isort - flake8 zephyr_ml tests - isort -c --recursive zephyr_ml tests + invoke lint .PHONY: fix-lint fix-lint: ## fix lint issues using autoflake, autopep8, and isort @@ -97,21 +96,15 @@ fix-lint: ## fix lint issues using autoflake, autopep8, and isort .PHONY: test-unit test-unit: ## run tests quickly with the default Python - python -m pytest --cov=zephyr_ml + invoke pytest .PHONY: test-readme test-readme: ## run the readme snippets - rm -rf tests/readme_test && mkdir -p tests/readme_test/notebooks - cp -r notebooks/data tests/readme_test/notebooks/ - cd tests/readme_test && rundoc run --single-session python3 -t python3 ../../README.md - rm -rf tests/readme_test - + invoke readme .PHONY: test-tutorials test-tutorials: ## run the tutorial notebooks - find notebooks -path "*/.ipynb_checkpoints" -prune -false -o -name "*.ipynb" -exec \ - jupyter nbconvert --execute --ExecutePreprocessor.timeout=3600 --to=html --stdout {} > /dev/null \; - + invoke tutorials .PHONY: test test: test-unit test-readme test-tutorials ## test everything that needs test dependencies diff --git a/notebooks/feature_engineering.ipynb b/notebooks/feature_engineering.ipynb index b4fb48f..ea3c726 100644 --- a/notebooks/feature_engineering.ipynb +++ b/notebooks/feature_engineering.ipynb @@ -228,6 +228,24 @@ "pidata_es" ] }, + { + "cell_type": "markdown", + "id": "ffe532a6", + "metadata": {}, + "source": [ + "To visualize the entityset and its relationships, use `.plot` functionality." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "cc618e40", + "metadata": {}, + "outputs": [], + "source": [ + "# pidata_es.plot('viz.png')" + ] + }, { "cell_type": "markdown", "id": "4746e7e1", @@ -239,7 +257,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "e0ee16eb", "metadata": {}, "outputs": [ @@ -285,7 +303,7 @@ "0 0 2022-01-01 45801.0" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -299,12 +317,208 @@ "label_times" ] }, + { + "cell_type": "markdown", + "id": "ab8eefd3", + "metadata": {}, + "source": [ + "## 3) Feature Engineering with SigPro\n", + "\n", + "Process signals with [SigPro](https://github.com/sintel-dev/SigPro) for PI signals or SCADA signals.\n", + "\n", + "Processing signals is done by specifying the `transformations` and `aggregations` we wish to apply to the data. To look at some of the primitives readily available, we use `get_primitives` function from `SigPro`." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "191a123a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['sigpro.SigPro',\n", + " 'sigpro.aggregations.amplitude.statistical.crest_factor',\n", + " 'sigpro.aggregations.amplitude.statistical.kurtosis',\n", + " 'sigpro.aggregations.amplitude.statistical.mean',\n", + " 'sigpro.aggregations.amplitude.statistical.rms',\n", + " 'sigpro.aggregations.amplitude.statistical.skew',\n", + " 'sigpro.aggregations.amplitude.statistical.std',\n", + " 'sigpro.aggregations.amplitude.statistical.var',\n", + " 'sigpro.aggregations.frequency.band.band_mean',\n", + " 'sigpro.transformations.amplitude.identity.identity',\n", + " 'sigpro.transformations.amplitude.spectrum.power_spectrum',\n", + " 'sigpro.transformations.frequency.band.frequency_band',\n", + " 'sigpro.transformations.frequency.fft.fft',\n", + " 'sigpro.transformations.frequency.fft.fft_real',\n", + " 'sigpro.transformations.frequency_time.stft.stft',\n", + " 'sigpro.transformations.frequency_time.stft.stft_real']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sigpro import get_primitives\n", + "\n", + "get_primitives()" + ] + }, + { + "cell_type": "markdown", + "id": "5b23aff6", + "metadata": {}, + "source": [ + "Suppose we are interested in finding the amplitude mean for each month of readings in the signal. We first specify the `name` and respective `primitive` we want to apply for both `transformations` and `aggregations`.\n", + "\n", + "In this case, we are interested in an identity transformation and mean aggregation." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "961af0ef", + "metadata": {}, + "outputs": [], + "source": [ + "aggregations = [{\n", + " \"name\":\"mean\",\n", + " \"primitive\":\"sigpro.aggregations.amplitude.statistical.mean\"\n", + "}]\n", + "\n", + "transformations = [{\n", + " \"name\":\"fft\",\n", + " \"primitive\":\"sigpro.transformations.amplitude.identity.identity\"\n", + "}]" + ] + }, + { + "cell_type": "markdown", + "id": "a9a3f3a6", + "metadata": {}, + "source": [ + "We use `process_signals` function to accomplish our goal. 
We pass the following:\n",
+    "- `es`: the entityset we are working with.\n",
+    "- `signal_dataframe_name`: the name of the signal dataframe, either `pidata` or `scada`.\n",
+    "- `signal_column`: the name of the signal column in the dataframe.\n",
+    "- `transformations`: the list of transformation primitives to apply.\n",
+    "- `aggregations`: the list of aggregation primitives to apply.\n",
+    "- `window_size`: the size of the bin we want to process the signals over, e.g. each month.\n",
+    "- `replace_dataframe`: whether to replace the current dataframe or add the processed data as a new dataframe."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "bea94368",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/sarah/anaconda3/envs/Zephyr/lib/python3.8/site-packages/numpy/core/fromnumeric.py:3474: RuntimeWarning: Mean of empty slice.\n",
+      "  return _methods._mean(a, axis=axis, dtype=dtype,\n",
+      "/Users/sarah/anaconda3/envs/Zephyr/lib/python3.8/site-packages/numpy/core/_methods.py:189: RuntimeWarning: invalid value encountered in double_scalars\n",
+      "  ret = ret.dtype.type(ret / rcount)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
_indexCOD_ELEMENTtimefft.mean.mean_value
0002022-01-319872
1102022-02-28<NA>
2202022-03-31559
\n", + "
" + ], + "text/plain": [ + " _index COD_ELEMENT time fft.mean.mean_value\n", + "0 0 0 2022-01-31 9872\n", + "1 1 0 2022-02-28 \n", + "2 2 0 2022-03-31 559" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from zephyr_ml.feature_engineering import process_signals\n", + "\n", + "process_signals(es=pidata_es, \n", + " signal_dataframe_name='pidata', \n", + " signal_column='val1', \n", + " transformations=transformations, \n", + " aggregations=aggregations,\n", + " window_size='1m', \n", + " replace_dataframe=False)\n", + "\n", + "pidata_es['pidata_processed']" + ] + }, + { + "cell_type": "markdown", + "id": "fd88812a", + "metadata": {}, + "source": [ + "Based on our original observations of `val1`, we now have `pidata_processed` with an entry for each month and the respective mean value of observations we see in that month.\n", + "\n", + "**Note**: in the months we don't have observations, the value becomes null." + ] + }, { "cell_type": "markdown", "id": "5aacf99b", "metadata": {}, "source": [ - "## 3) Feature Engineering with Featuretools\n", + "## 4) Feature Engineering with Featuretools\n", "Using EntitySets and LabelTimes allows us to easily use Featuretools for automatic feature generation. For example, we can set interesting categorical values in our EntitySet and use them to generate aggregation features grouped by those interesting values. We can also set which primitives we want to use and control which columns and entities those primitives can be applied to. Featuretools can also use label times as cutoff times, ensuring that data after the label times is not used in feature generation. \n", "\n", "For additonal help using Featuretools, please see the documentation: https://featuretools.alteryx.com/en/stable/index.html" @@ -312,7 +526,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 9, "id": "ee020300", "metadata": {}, "outputs": [ @@ -349,10 +563,14 @@ " ,\n", " ,\n", " ,\n", - " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", " ,\n", - " ,\n", + " ,\n", " ,\n", + " ,\n", " ,\n", " ,\n", " ,\n", @@ -364,7 +582,7 @@ " ]" ] }, - "execution_count": 5, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -391,7 +609,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 10, "id": "bdce0acf", "metadata": {}, "outputs": [ @@ -493,7 +711,7 @@ " \n", " \n", "\n", - "
</table>\n",
-       "<p>1 rows × 44 columns</p>\n",
+       "<p>1 rows × 48 columns</p>
\n", "" ], "text/plain": [ @@ -549,10 +767,10 @@ "COD_ELEMENT time \n", "0 2022-01-01 3.0 45801.0 \n", "\n", - "[1 rows x 44 columns]" + "[1 rows x 48 columns]" ] }, - "execution_count": 6, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -560,202 +778,6 @@ "source": [ "feature_matrix" ] - }, - { - "cell_type": "markdown", - "id": "ab8eefd3", - "metadata": {}, - "source": [ - "## 3) Feature Engineering with SigPro\n", - "\n", - "Process signals with [SigPro](https://github.com/sintel-dev/SigPro) for PI signals or SCADA signals.\n", - "\n", - "Processing signals is done by specifying the `transformations` and `aggregations` we wish to apply to the data. To look at some of the primitives readily available, we use `get_primitives` function from `SigPro`." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "191a123a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['sigpro.SigPro',\n", - " 'sigpro.aggregations.amplitude.statistical.crest_factor',\n", - " 'sigpro.aggregations.amplitude.statistical.kurtosis',\n", - " 'sigpro.aggregations.amplitude.statistical.mean',\n", - " 'sigpro.aggregations.amplitude.statistical.rms',\n", - " 'sigpro.aggregations.amplitude.statistical.skew',\n", - " 'sigpro.aggregations.amplitude.statistical.std',\n", - " 'sigpro.aggregations.amplitude.statistical.var',\n", - " 'sigpro.aggregations.frequency.band.band_mean',\n", - " 'sigpro.transformations.amplitude.identity.identity',\n", - " 'sigpro.transformations.amplitude.spectrum.power_spectrum',\n", - " 'sigpro.transformations.frequency.band.frequency_band',\n", - " 'sigpro.transformations.frequency.fft.fft',\n", - " 'sigpro.transformations.frequency.fft.fft_real',\n", - " 'sigpro.transformations.frequency_time.stft.stft',\n", - " 'sigpro.transformations.frequency_time.stft.stft_real']" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sigpro import get_primitives\n", - "\n", - "get_primitives()" - ] - }, - { - "cell_type": "markdown", - "id": "5b23aff6", - "metadata": {}, - "source": [ - "Suppose we are interested in finding the amplitude mean for each month of readings in the signal. We first specify the `name` and respective `primitive` we want to apply for both `transformations` and `aggregations`.\n", - "\n", - "In this case, we are interested in an identity transformation and mean aggregation." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "961af0ef", - "metadata": {}, - "outputs": [], - "source": [ - "aggregations = [{\n", - " \"name\":\"mean\",\n", - " \"primitive\":\"sigpro.aggregations.amplitude.statistical.mean\"\n", - "}]\n", - "\n", - "transformations = [{\n", - " \"name\":\"fft\",\n", - " \"primitive\":\"sigpro.transformations.amplitude.identity.identity\"\n", - "}]" - ] - }, - { - "cell_type": "markdown", - "id": "a9a3f3a6", - "metadata": {}, - "source": [ - "We use `process_signals` function to accomplish our goal. We pass the following:\n", - "- `es`: the entityset we are working with.\n", - "- `signal_dataframe_name`: the name of the dataframe whether `pidata` or `scada`.\n", - "- `signal_column`: the name of the signal column in the dataframe.\n", - "- `window_size`: the size of the bin we want to process the signals over, e.g. each month.\n", - "- `replace_dataframe`: an indicator whether we want to replace the current dataframe or add it as a new one." 
- ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "bea94368", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/sarah/anaconda3/envs/Zephyr/lib/python3.8/site-packages/numpy/core/fromnumeric.py:3474: RuntimeWarning: Mean of empty slice.\n", - " return _methods._mean(a, axis=axis, dtype=dtype,\n", - "/Users/sarah/anaconda3/envs/Zephyr/lib/python3.8/site-packages/numpy/core/_methods.py:189: RuntimeWarning: invalid value encountered in double_scalars\n", - " ret = ret.dtype.type(ret / rcount)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
_indexCOD_ELEMENTtimefft.mean.mean_value
0002022-01-319872
1102022-02-28<NA>
2202022-03-31559
\n", - "
" - ], - "text/plain": [ - " _index COD_ELEMENT time fft.mean.mean_value\n", - "0 0 0 2022-01-31 9872\n", - "1 1 0 2022-02-28 \n", - "2 2 0 2022-03-31 559" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from zephyr_ml.feature_engineering import process_signals\n", - "\n", - "process_signals(es=pidata_es, \n", - " signal_dataframe_name='pidata', \n", - " signal_column='val1', \n", - " transformations=transformations, \n", - " aggregations=aggregations,\n", - " window_size='1m', \n", - " replace_dataframe=False)\n", - "\n", - "pidata_es['pidata_processed']" - ] - }, - { - "cell_type": "markdown", - "id": "fd88812a", - "metadata": {}, - "source": [ - "Based on our original observations of `val1`, we now have `pidata_processed` with an entry for each month and the respective mean value of observations we see in that month.\n", - "\n", - "**Note**: in the months we don't have observations, the value becomes null." - ] } ], "metadata": { diff --git a/setup.py b/setup.py index a3c1b35..f1996bc 100644 --- a/setup.py +++ b/setup.py @@ -37,6 +37,7 @@ 'pytest-cov>=2.6.0', 'jupyter>=1.0.0,<2', 'rundoc>=0.4.3,<0.5', + 'invoke', ] development_requires = [ diff --git a/tasks.py b/tasks.py new file mode 100644 index 0000000..31ffebf --- /dev/null +++ b/tasks.py @@ -0,0 +1,149 @@ +import glob +import operator +import os +import platform +import re +import shutil +import stat +from pathlib import Path + +from invoke import task + + +COMPARISONS = { + '>=': operator.ge, + '>': operator.gt, + '<': operator.lt, + '<=': operator.le +} + + +@task +def pytest(c): + c.run('python -m pytest --cov=zephyr_ml') + + +@task +def install_minimum(c): + with open('setup.py', 'r') as setup_py: + lines = setup_py.read().splitlines() + + versions = [] + started = False + for line in lines: + if started: + if line == ']': + started = False + continue + + line = line.strip() + if line.startswith('#'): # ignore comment + continue + + # get specific package version based on declared python version + if 'python_version' in line: + python_version = re.search(r"python_version(<=?|>=?)\'(\d\.?)+\'", line) + operation = python_version.group(1) + version_number = python_version.group(0).split(operation)[-1].replace("'", "") + if COMPARISONS[operation](platform.python_version(), version_number): + line = line.split(";")[0] + else: + continue + + line = re.sub(r"""['",]""", '', line) + line = re.sub(r'>=?', '==', line) + if '==' in line: + line = re.sub(r',?<=?[\d.]*,?', '', line) + + elif re.search(r',?<=?[\d.]*,?', line): + line = f"'{line}'" + + versions.append(line) + + elif line.startswith('install_requires = ['): + started = True + + c.run(f'python -m pip install {" ".join(versions)}') + + +@task +def minimum(c): + install_minimum(c) + c.run('python -m pip check') + c.run('python -m pytest') + + +@task +def readme(c): + test_path = Path('tests/readme_test') + if test_path.exists() and test_path.is_dir(): + shutil.rmtree(test_path) + + cwd = os.getcwd() + os.makedirs(test_path, exist_ok=True) + shutil.copy('README.md', test_path / 'README.md') + shutil.copytree('notebooks/data', test_path / 'notebooks/data') + os.chdir(test_path) + c.run('rundoc run --single-session python3 -t python3 README.md') + os.chdir(cwd) + shutil.rmtree(test_path) + + +@task +def tutorials(c): + for ipynb_file in glob.glob('notebooks/*.ipynb'): + if '.ipynb_checkpoints' not in ipynb_file: + c.run(( + 'jupyter nbconvert --execute --ExecutePreprocessor.timeout=3600 ' + f'--to=html --stdout 
{ipynb_file}'
+            ), hide='out')
+
+
+@task
+def lint(c):
+    c.run('flake8 zephyr_ml tests')
+    c.run('isort -c --recursive zephyr_ml tests')
+
+
+@task
+def checkdeps(c, path):
+    with open('setup.py', 'r') as setup_py:
+        lines = setup_py.read().splitlines()
+
+    packages = []
+    started = False
+    for line in lines:
+        if started:
+            if line == ']':
+                started = False
+                continue
+
+            line = line.strip()
+            if line.startswith('#') or not line:  # ignore comment
+                continue
+
+            # keep only the package name, dropping any version specifier
+            line = re.split(r'>?=?<', line)[0]
+            packages.append(line)
+
+        elif line.startswith('install_requires = ['):
+            started = True
+
+    # write the installed versions of the declared dependencies to path
+    c.run(f'python -m pip freeze | grep -E \'{"|".join(packages)}\' > {path}')
+
+
+def remove_readonly(func, path, _):
+    "Clear the readonly bit and reattempt the removal"
+    os.chmod(path, stat.S_IWRITE)
+    func(path)
+
+
+@task
+def rmdir(c, path):
+    try:
+        shutil.rmtree(path, onerror=remove_readonly)
+    except PermissionError:
+        pass
\ No newline at end of file
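
As a quick local sanity check of the new `tasks.py` — a sketch, assuming `invoke` is installed in the active environment; the task names mirror the Makefile targets:

    $ pip install invoke
    $ invoke --list
    $ invoke lint
    $ invoke pytest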