From 2924f5a713bb0c42dedbb89e031acab05b558e10 Mon Sep 17 00:00:00 2001
From: Brian Wylie
Date: Wed, 12 May 2021 11:11:49 -0600
Subject: [PATCH] Make pyspark an optional dependency

---
 README.md                              |  9 ++++++++-
 examples/kafka_spark.py                | 15 +++++++--------
 examples/zeek_to_parquet_with_spark.py |  6 +++++-
 setup.py                               |  8 +++-----
 zat/__init__.py                        |  2 +-
 zat/log_to_sparkdf.py                  | 16 +++++++++++++---
 6 files changed, 37 insertions(+), 19 deletions(-)

diff --git a/README.md b/README.md
index 7f81857..410e320 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,9 @@ with Pandas, scikit-learn, Kafka, and Spark
 
 ### Install
 ```
-$ pip install zat
+pip install zat
+pip install zat[pyspark] (includes the pyspark library)
+pip install zat[all] (includes pyspark, pyarrow, yara-python, and tldextract)
 ```
 
 ### Getting Started
@@ -59,6 +61,11 @@ from here to there.
 
 ### Documentation
 
+#### Running the Tests
+```
+pip install pytest coverage pytest-cov
+pytest zat
+```
 
 ### About SuperCowPowers
 The company was formed so that its developers could follow their passion for Python, streaming data pipelines and having fun with data analysis. We also think cows are cool and should be superheros or at least carry around rayguns and burner phones. Visit SuperCowPowers
diff --git a/examples/kafka_spark.py b/examples/kafka_spark.py
index 33b12f9..7bdb53b 100644
--- a/examples/kafka_spark.py
+++ b/examples/kafka_spark.py
@@ -1,18 +1,17 @@
 """Read Kafka Streams into Spark, perform simple filtering/aggregation"""
 
 import sys
-import pyspark
-from pyspark.sql import SparkSession
-from pyspark.sql.types import StructType, StringType, BooleanType, IntegerType
-from pyspark.sql.functions import from_json, to_json, col, struct, udf
-
 import argparse
 from time import sleep
 
-# Local imports
-from zat.utils import signal_utils
+try:
+    from pyspark.sql import SparkSession
+    from pyspark.sql.types import StructType, StringType, BooleanType, IntegerType
+    from pyspark.sql.functions import from_json, to_json, col, struct, udf
+except ImportError:
+    print('\npip install pyspark')
+    sys.exit(1)
 
-# Third Party Imports
 try:
     import tldextract
 except ImportError:
diff --git a/examples/zeek_to_parquet_with_spark.py b/examples/zeek_to_parquet_with_spark.py
index 56492ab..c8b041d 100644
--- a/examples/zeek_to_parquet_with_spark.py
+++ b/examples/zeek_to_parquet_with_spark.py
@@ -3,7 +3,11 @@
 import os
 import sys
 import argparse
-from pyspark.sql import SparkSession
+try:
+    from pyspark.sql import SparkSession
+except ImportError:
+    print('\npip install pyspark')
+    sys.exit(1)
 
 # Local imports
 from zat import log_to_sparkdf
diff --git a/setup.py b/setup.py
index d3a1c3f..834719d 100644
--- a/setup.py
+++ b/setup.py
@@ -36,14 +36,12 @@ def get_files(dir_name):
     install_requires=[
         'requests',
         'watchdog',
-        'numpy',
-        'scipy',
         'pandas',
-        'scikit-learn',
-        'pyspark'
+        'scikit-learn'
     ],
     extras_require={
-        'all': ['pyarrow', 'yara-python', 'tldextract']
+        'pyspark': ['pyspark'],
+        'all': ['pyspark', 'pyarrow', 'yara-python', 'tldextract']
     },
     license='Apache',
     keywords='Zeek, Bro, Python, Networking, Security, Scikit-Learn, Spark, Kafka, Parquet',
diff --git a/zat/__init__.py b/zat/__init__.py
index 60f4b6c..b5a7dbd 100644
--- a/zat/__init__.py
+++ b/zat/__init__.py
@@ -1,3 +1,3 @@
 __author__ = 'Brian Wylie'
 __email__ = 'briford@supercowpowers.com'
-__version__ = '0.4.2'
+__version__ = '0.4.3'
diff --git a/zat/log_to_sparkdf.py b/zat/log_to_sparkdf.py
index 28c0166..03eb14b 100644
--- a/zat/log_to_sparkdf.py
+++ b/zat/log_to_sparkdf.py
@@ -1,9 +1,14 @@
 """LogToSparkDF: Converts a Zeek log to a Spark DataFrame"""
+import sys
 
 # Third Party
-from pyspark.sql.types import StructType, StringType, IntegerType, FloatType, LongType
-from pyspark.sql.functions import col, when
+try:
+    from pyspark.sql.types import StructType, StringType, IntegerType, FloatType, LongType
+    from pyspark.sql.functions import col, when
+except ImportError:
+    print('\npip install pyspark')
+    sys.exit(1)
 
 # Local
 from zat import zeek_log_reader
 
@@ -115,8 +120,13 @@ def build_spark_schema(self, column_names, column_types, verbose=False):
 def test():
     """Test for LogToSparkDF Class"""
     import os
+    import pytest
     from zat.utils import file_utils
-    from pyspark.sql import SparkSession
+
+    try:
+        from pyspark.sql import SparkSession
+    except ImportError:
+        pytest.skip('pip install pyspark')
 
     # Spin up a local Spark Session (with 4 executors)
     spark = SparkSession.builder.master('local[4]').appName('my_awesome').getOrCreate()
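
Reviewer note, not part of the patch: a minimal sketch of the caller-side pattern this change assumes, mirroring the guarded imports added above. The `LogToSparkDF(spark).create_dataframe()` call and the conn.log path are assumptions drawn from the examples this patch touches, not APIs it introduces.

```python
"""Sketch: using zat's Spark support now that pyspark is an optional extra"""
import sys

# Same guard the patch adds to the examples: fail fast with an install
# hint instead of a NameError deep inside zat.log_to_sparkdf later on
try:
    from pyspark.sql import SparkSession
except ImportError:
    print('\npip install zat[pyspark]')
    sys.exit(1)

from zat import log_to_sparkdf

# Local Spark session (4 executors), then load a Zeek log into a Spark
# DataFrame ('conn.log' is a hypothetical path) and do a simple aggregation
spark = SparkSession.builder.master('local[4]').appName('zat_sketch').getOrCreate()
spark_df = log_to_sparkdf.LogToSparkDF(spark).create_dataframe('conn.log')
spark_df.groupBy('proto').count().show()
```

Note that under zsh the extras syntax needs quoting: `pip install 'zat[pyspark]'`.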