diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 9511644..66e4715 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -37,7 +37,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install package and dependencies - run: pip install rundoc . + run: pip install invoke rundoc . - name: make test-readme run: make test-readme @@ -46,7 +46,7 @@ jobs: strategy: matrix: python-version: [3.7, 3.8] - os: [ubuntu-latest, macos-latest] + os: [ubuntu-latest, macos-latest, windows-latest] steps: - uses: actions/checkout@v1 - name: Set up Python ${{ matrix.python-version }} @@ -54,7 +54,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install package and dependencies - run: pip install .[test] + run: pip install invoke .[test] - name: make test-unit run: make test-unit @@ -63,7 +63,7 @@ jobs: strategy: matrix: python-version: [3.7, 3.8] - os: [ubuntu-latest, macos-latest] + os: [ubuntu-latest, macos-latest, windows-latest] steps: - uses: actions/checkout@v1 - name: Set up Python ${{ matrix.python-version }} @@ -71,6 +71,6 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install package and dependencies - run: pip install .[test] + run: pip install invoke .[test] - name: make test-tutorials run: make test-tutorials \ No newline at end of file diff --git a/Makefile b/Makefile index 89530a1..d81ae6d 100644 --- a/Makefile +++ b/Makefile @@ -84,8 +84,7 @@ install-develop: clean-build clean-pyc ## install the package in editable mode a .PHONY: lint lint: ## check style with flake8 and isort - flake8 zephyr_ml tests - isort -c --recursive zephyr_ml tests + invoke lint .PHONY: fix-lint fix-lint: ## fix lint issues using autoflake, autopep8, and isort @@ -97,21 +96,15 @@ fix-lint: ## fix lint issues using autoflake, autopep8, and isort .PHONY: test-unit test-unit: ## run tests quickly with the default Python - python -m pytest --cov=zephyr_ml + invoke pytest .PHONY: test-readme 
test-readme: ## run the readme snippets - rm -rf tests/readme_test && mkdir -p tests/readme_test/notebooks - cp -r notebooks/data tests/readme_test/notebooks/ - cd tests/readme_test && rundoc run --single-session python3 -t python3 ../../README.md - rm -rf tests/readme_test - + invoke readme .PHONY: test-tutorials test-tutorials: ## run the tutorial notebooks - find notebooks -path "*/.ipynb_checkpoints" -prune -false -o -name "*.ipynb" -exec \ - jupyter nbconvert --execute --ExecutePreprocessor.timeout=3600 --to=html --stdout {} > /dev/null \; - + invoke tutorials .PHONY: test test: test-unit test-readme test-tutorials ## test everything that needs test dependencies diff --git a/notebooks/feature_engineering.ipynb b/notebooks/feature_engineering.ipynb index b4fb48f..ea3c726 100644 --- a/notebooks/feature_engineering.ipynb +++ b/notebooks/feature_engineering.ipynb @@ -228,6 +228,24 @@ "pidata_es" ] }, + { + "cell_type": "markdown", + "id": "ffe532a6", + "metadata": {}, + "source": [ + "To visualize the entityset and its relationships, use `.plot` functionality." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "cc618e40", + "metadata": {}, + "outputs": [], + "source": [ + "# pidata_es.plot('viz.png')" + ] + }, { "cell_type": "markdown", "id": "4746e7e1", @@ -239,7 +257,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "e0ee16eb", "metadata": {}, "outputs": [ @@ -285,7 +303,7 @@ "0 0 2022-01-01 45801.0" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -299,12 +317,208 @@ "label_times" ] }, + { + "cell_type": "markdown", + "id": "ab8eefd3", + "metadata": {}, + "source": [ + "## 3) Feature Engineering with SigPro\n", + "\n", + "Process signals with [SigPro](https://github.com/sintel-dev/SigPro) for PI signals or SCADA signals.\n", + "\n", + "Processing signals is done by specifying the `transformations` and `aggregations` we wish to apply to the data. 
To look at some of the primitives readily available, we use the `get_primitives` function from `SigPro`." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "191a123a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['sigpro.SigPro',\n", + " 'sigpro.aggregations.amplitude.statistical.crest_factor',\n", + " 'sigpro.aggregations.amplitude.statistical.kurtosis',\n", + " 'sigpro.aggregations.amplitude.statistical.mean',\n", + " 'sigpro.aggregations.amplitude.statistical.rms',\n", + " 'sigpro.aggregations.amplitude.statistical.skew',\n", + " 'sigpro.aggregations.amplitude.statistical.std',\n", + " 'sigpro.aggregations.amplitude.statistical.var',\n", + " 'sigpro.aggregations.frequency.band.band_mean',\n", + " 'sigpro.transformations.amplitude.identity.identity',\n", + " 'sigpro.transformations.amplitude.spectrum.power_spectrum',\n", + " 'sigpro.transformations.frequency.band.frequency_band',\n", + " 'sigpro.transformations.frequency.fft.fft',\n", + " 'sigpro.transformations.frequency.fft.fft_real',\n", + " 'sigpro.transformations.frequency_time.stft.stft',\n", + " 'sigpro.transformations.frequency_time.stft.stft_real']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sigpro import get_primitives\n", + "\n", + "get_primitives()" + ] + }, + { + "cell_type": "markdown", + "id": "5b23aff6", + "metadata": {}, + "source": [ + "Suppose we are interested in finding the amplitude mean for each month of readings in the signal. We first specify the `name` and the respective `primitive` we want to apply for both `transformations` and `aggregations`.\n", + "\n", + "In this case, we are interested in an identity transformation and mean aggregation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "961af0ef", + "metadata": {}, + "outputs": [], + "source": [ + "aggregations = [{\n", + " \"name\":\"mean\",\n", + " \"primitive\":\"sigpro.aggregations.amplitude.statistical.mean\"\n", + "}]\n", + "\n", + "transformations = [{\n", + " \"name\":\"fft\",\n", + " \"primitive\":\"sigpro.transformations.amplitude.identity.identity\"\n", + "}]" + ] + }, + { + "cell_type": "markdown", + "id": "a9a3f3a6", + "metadata": {}, + "source": [ + "We use the `process_signals` function to accomplish our goal. We pass the following:\n", + "- `es`: the entityset we are working with.\n", + "- `signal_dataframe_name`: the name of the signal dataframe, either `pidata` or `scada`.\n", + "- `signal_column`: the name of the signal column in the dataframe.\n", + "- `window_size`: the size of the bin we want to process the signals over, e.g. each month.\n", + "- `replace_dataframe`: a flag indicating whether we want to replace the current dataframe or add the result as a new one." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "bea94368", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sarah/anaconda3/envs/Zephyr/lib/python3.8/site-packages/numpy/core/fromnumeric.py:3474: RuntimeWarning: Mean of empty slice.\n", + " return _methods._mean(a, axis=axis, dtype=dtype,\n", + "/Users/sarah/anaconda3/envs/Zephyr/lib/python3.8/site-packages/numpy/core/_methods.py:189: RuntimeWarning: invalid value encountered in double_scalars\n", + " ret = ret.dtype.type(ret / rcount)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + " | _index | \n", + "COD_ELEMENT | \n", + "time | \n", + "fft.mean.mean_value | \n", + "
---|---|---|---|---|
0 | \n", + "0 | \n", + "0 | \n", + "2022-01-31 | \n", + "9872 | \n", + "
1 | \n", + "1 | \n", + "0 | \n", + "2022-02-28 | \n", + "<NA> | \n", + "
2 | \n", + "2 | \n", + "0 | \n", + "2022-03-31 | \n", + "559 | \n", + "
1 rows × 44 columns
\n", + "1 rows × 48 columns
\n", "" ], "text/plain": [ @@ -549,10 +767,10 @@ "COD_ELEMENT time \n", "0 2022-01-01 3.0 45801.0 \n", "\n", - "[1 rows x 44 columns]" + "[1 rows x 48 columns]" ] }, - "execution_count": 6, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -560,202 +778,6 @@ "source": [ "feature_matrix" ] - }, - { - "cell_type": "markdown", - "id": "ab8eefd3", - "metadata": {}, - "source": [ - "## 3) Feature Engineering with SigPro\n", - "\n", - "Process signals with [SigPro](https://github.com/sintel-dev/SigPro) for PI signals or SCADA signals.\n", - "\n", - "Processing signals is done by specifying the `transformations` and `aggregations` we wish to apply to the data. To look at some of the primitives readily available, we use `get_primitives` function from `SigPro`." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "191a123a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['sigpro.SigPro',\n", - " 'sigpro.aggregations.amplitude.statistical.crest_factor',\n", - " 'sigpro.aggregations.amplitude.statistical.kurtosis',\n", - " 'sigpro.aggregations.amplitude.statistical.mean',\n", - " 'sigpro.aggregations.amplitude.statistical.rms',\n", - " 'sigpro.aggregations.amplitude.statistical.skew',\n", - " 'sigpro.aggregations.amplitude.statistical.std',\n", - " 'sigpro.aggregations.amplitude.statistical.var',\n", - " 'sigpro.aggregations.frequency.band.band_mean',\n", - " 'sigpro.transformations.amplitude.identity.identity',\n", - " 'sigpro.transformations.amplitude.spectrum.power_spectrum',\n", - " 'sigpro.transformations.frequency.band.frequency_band',\n", - " 'sigpro.transformations.frequency.fft.fft',\n", - " 'sigpro.transformations.frequency.fft.fft_real',\n", - " 'sigpro.transformations.frequency_time.stft.stft',\n", - " 'sigpro.transformations.frequency_time.stft.stft_real']" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sigpro import 
get_primitives\n", - "\n", - "get_primitives()" - ] - }, - { - "cell_type": "markdown", - "id": "5b23aff6", - "metadata": {}, - "source": [ - "Suppose we are interested in finding the amplitude mean for each month of readings in the signal. We first specify the `name` and respective `primitive` we want to apply for both `transformations` and `aggregations`.\n", - "\n", - "In this case, we are interested in an identity transformation and mean aggregation." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "961af0ef", - "metadata": {}, - "outputs": [], - "source": [ - "aggregations = [{\n", - " \"name\":\"mean\",\n", - " \"primitive\":\"sigpro.aggregations.amplitude.statistical.mean\"\n", - "}]\n", - "\n", - "transformations = [{\n", - " \"name\":\"fft\",\n", - " \"primitive\":\"sigpro.transformations.amplitude.identity.identity\"\n", - "}]" - ] - }, - { - "cell_type": "markdown", - "id": "a9a3f3a6", - "metadata": {}, - "source": [ - "We use `process_signals` function to accomplish our goal. We pass the following:\n", - "- `es`: the entityset we are working with.\n", - "- `signal_dataframe_name`: the name of the dataframe whether `pidata` or `scada`.\n", - "- `signal_column`: the name of the signal column in the dataframe.\n", - "- `window_size`: the size of the bin we want to process the signals over, e.g. each month.\n", - "- `replace_dataframe`: an indicator whether we want to replace the current dataframe or add it as a new one." 
- ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "bea94368", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/sarah/anaconda3/envs/Zephyr/lib/python3.8/site-packages/numpy/core/fromnumeric.py:3474: RuntimeWarning: Mean of empty slice.\n", - " return _methods._mean(a, axis=axis, dtype=dtype,\n", - "/Users/sarah/anaconda3/envs/Zephyr/lib/python3.8/site-packages/numpy/core/_methods.py:189: RuntimeWarning: invalid value encountered in double_scalars\n", - " ret = ret.dtype.type(ret / rcount)\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " | _index | \n", - "COD_ELEMENT | \n", - "time | \n", - "fft.mean.mean_value | \n", - "
---|---|---|---|---|
0 | \n", - "0 | \n", - "0 | \n", - "2022-01-31 | \n", - "9872 | \n", - "
1 | \n", - "1 | \n", - "0 | \n", - "2022-02-28 | \n", - "<NA> | \n", - "
2 | \n", - "2 | \n", - "0 | \n", - "2022-03-31 | \n", - "559 | \n", - "