Skip to content

Commit

Permalink
Merge pull request #92 from /issues/89
Browse files Browse the repository at this point in the history
Ingest data even if only image CSVs are present
  • Loading branch information
mcquin authored Jan 24, 2018
2 parents 9a6b198 + 8a36b56 commit 1a98b4d
Show file tree
Hide file tree
Showing 11 changed files with 212 additions and 156 deletions.
25 changes: 10 additions & 15 deletions cytominer_database/commands/command_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,29 +39,24 @@
"--munge/--no-munge",
default=True,
help="""\
True if the CSV files for individual compartments
have been merged into a single CSV file. If True,
the CSV will be split into one CSV per compartment.
True if the CSV files for individual compartments \
have been merged into a single CSV file; \
the CSV will be split into one CSV per compartment \
(Default: true).
"""
)
@click.option(
"--skip-image-prefix/--no-skip-image-prefix",
default=True,
help="""\
True if the prefix of image table name should be
excluded from the names of columns from per image
table e.g. use `Metadata_Plate` instead of
`Image_Metadata_Plate`.
True if the prefix of image table name should be \
excluded from the names of columns from per image \
table e.g. use `Metadata_Plate` instead of \
`Image_Metadata_Plate` (Default: true).
"""
)
def command(source, target, config_file, munge, skip_image_prefix):
config = configparser.ConfigParser()

with open(config_file, "r") as config_fd:
config.read_file(config_fd)

if munge:
cytominer_database.munge.munge(config=config, source=source)
cytominer_database.munge.munge(config_file=config_file, source=source)

cytominer_database.ingest.seed(source, target, config,
skip_image_prefix)
cytominer_database.ingest.seed(source, target, config_file, skip_image_prefix)
3 changes: 3 additions & 0 deletions cytominer_database/config/config_default.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[filenames]
image = Image.csv
experiment = Experiment.csv
17 changes: 6 additions & 11 deletions cytominer_database/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,8 @@
Example::
import configparser
import cytominer_database.ingest
config = configparser.ConfigParser()
with open(config_file, "r") as config_fd:
config.read_file(config_fd)
cytominer_database.ingest.seed(source, target, config)
"""

Expand Down Expand Up @@ -100,22 +94,23 @@ def into(input, output, name, identifier, skip_table_prefix=False):
odo.odo(source, "{}::{}".format(output, name), has_header=True, delimiter=",")


def seed(source, target, config, skip_image_prefix=True):
def seed(source, target, config_file, skip_image_prefix=True):
"""
Read CSV files into a database backend.
:param config: Configuration file.
:param config_file: Configuration file.
:param source: Directory containing subdirectories that contain CSV files.
:param target: Connection string for the database.
:param skip_image_prefix: True if the prefix of image table name should be excluded
from the names of columns from per image table
"""
config_file = cytominer_database.utils.read_config(config_file)

directories = sorted(list(cytominer_database.utils.find_directories(source)))

for directory in directories:
try:
patterns, image = cytominer_database.utils.validate_csv_set(config, directory)
patterns, image = cytominer_database.utils.validate_csv_set(config_file, directory)
except IOError as e:
click.echo(e)

Expand All @@ -124,11 +119,11 @@ def seed(source, target, config, skip_image_prefix=True):
with open(image, "rb") as document:
identifier = hashlib.md5(document.read()).hexdigest()

name, _ = os.path.splitext(config["filenames"]["image"])
name, _ = os.path.splitext(config_file["filenames"]["image"])

try:
into(input=image, output=target, name=name.capitalize(), identifier=identifier,
skip_table_prefix = skip_image_prefix)
skip_table_prefix=skip_image_prefix)
except sqlalchemy.exc.DatabaseError as e:
click.echo(e)

Expand Down
20 changes: 12 additions & 8 deletions cytominer_database/munge.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
import logging
import os

import click
import pandas as pd

import cytominer_database.utils

logger = logging.getLogger(__name__)

def munge(config, source, target=None):

def munge(config_file, source, target=None):
"""
Searches ``source`` for directories containing a CSV file corresponding to
per-object measurements, then splits the CSV file into one CSV file per compartment.
Expand All @@ -15,7 +18,7 @@ def munge(config, source, target=None):
Cytoplasm, and Nuclei. ``munge`` will split this CSV file into 3 CSV files:
Cells.csv, Cytoplasm.csv, and Nuclei.csv.
:param config: Configuration file.
:param config_file: Configuration file.
:param source: Directory containing subdirectories that contain an object CSV file.
Expand All @@ -25,24 +28,25 @@ def munge(config, source, target=None):
Example::
import configparser
import cytominer_database.munge
config = configparser.ConfigParser()
with open(config_file, "r") as config_fd:
config.read_file(config_fd)
cytominer_database.munge.munge(source, target, config)
"""

config = cytominer_database.utils.read_config(config_file)

if not target:
target = source

directories = sorted(list(cytominer_database.utils.find_directories(source)))

valid_directories = [] # list of subdirectories that have an object CSV file.

if not config.has_option("filenames", "object"):
logger.warn("No object CSV configured, skipping `munge`.")

return valid_directories

for directory in directories:
try:
obj = pd.read_csv(os.path.join(directory, config["filenames"]["object"]), header=[0, 1])
Expand Down
56 changes: 51 additions & 5 deletions cytominer_database/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
import glob
import logging
import os
import pkg_resources
import tempfile
import warnings

import configparser
import csvkit.utilities.csvclean

# csvkit (or a dependency of csvkit) mucks with warning levels.
Expand Down Expand Up @@ -91,11 +93,7 @@ def validate_csv_set(config, directory):
if not os.path.isfile(image_csv):
raise IOError("{} not found in {}. Skipping.".format(config["filenames"]["image"], directory))

pattern_csvs = [filename for filename in glob.glob(os.path.join(directory, '*.csv')) if filename not in [
os.path.join(directory, config['filenames']['image']),
os.path.join(directory, config['filenames']['object']),
os.path.join(directory, config['filenames']['experiment'])
]]
pattern_csvs = collect_csvs(config, directory)

filenames = pattern_csvs + [image_csv]

Expand All @@ -106,3 +104,51 @@ def validate_csv_set(config, directory):
raise IOError("Some files were invalid: {}. Skipping {}.".format(invalid_files, directory))

return pattern_csvs, image_csv


def collect_csvs(config, directory):
    """
    Collect CSV files from a directory.

    This function collects CSV files in a directory, excluding those that have been specified in the configuration file.
    This enables collecting only those CSV files that correspond to cellular compartments. e.g. Cells.csv, Cytoplasm.csv,
    Nuclei.csv. CSV files corresponding to experiment, image, or object will be excluded.

    :param config: configuration file.

    :param directory: directory containing the CSV files.

    :return: a list of CSV files.
    """
    # Full paths of the configured experiment/image/object CSVs, when present.
    excluded = {
        os.path.join(directory, config["filenames"][option])
        for option in ("experiment", "image", "object")
        if config.has_option("filenames", option)
    }

    # Keep every CSV in the directory that is not one of the configured names.
    return [
        csv_path
        for csv_path in glob.glob(os.path.join(directory, "*.csv"))
        if csv_path not in excluded
    ]


def read_config(filename):
    """
    Read a configuration file, layered on top of the packaged defaults.

    The package's default configuration (``config/config_default.ini``) is
    read first, then *filename* is read on top of it, so options in
    *filename* override the defaults. A file that cannot be opened is
    skipped with a warning rather than raising, so a missing user config
    still yields the default configuration.

    :param filename: configuration filename

    :return: a configuration object
    """
    config = configparser.ConfigParser()

    for config_filename in [
        pkg_resources.resource_filename("cytominer_database", "config/config_default.ini"),  # default config file
        filename
    ]:
        try:
            with open(config_filename, "r") as fd:
                config.read_file(fd)
        except IOError:
            # Best-effort: an unreadable file leaves previously-read options
            # intact. Use warning() (warn() is deprecated) with lazy %-args.
            logger.warning("Unable to read configuration file: %s.", config_filename)

    return config
73 changes: 23 additions & 50 deletions tests/commands/test_command_ingest.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
import os

import click.testing
import configparser
import backports.tempfile
import odo
import pandas
import pkg_resources
import pandas as pd
import pytest

import cytominer_database.command
Expand All @@ -23,12 +21,10 @@ def test_help(runner):


def test_run(dataset, runner):
config_file = os.path.join(dataset["data_dir"], "config.ini")
opts = ["ingest"]

opts = [
"ingest",
"--config-file", config_file
]
if dataset["config"]:
opts += ["--config-file", os.path.join(dataset["data_dir"], dataset["config"])]

if dataset["munge"]:
opts += ["--munge"]
Expand All @@ -44,32 +40,23 @@ def test_run(dataset, runner):

result = runner.invoke(cytominer_database.command.command, opts)

assert result.exit_code == 0

config = configparser.ConfigParser()

with open(config_file, "r") as config_fd:
config.read_file(config_fd)

for (k, v) in dict({"cells": "Cells.csv", "cytoplasm": "Cytoplasm.csv", "nuclei": "Nuclei.csv"}).items():
config["filenames"][k] = v
assert result.exit_code == 0, result.output

for table_key in ["image", "cells", "cytoplasm", "nuclei"]:
csv_filename = os.path.join(temp_dir, config["filenames"][table_key])
for blob in dataset["ingest"]:
table_name = blob["table"]

table_name = config["filenames"][table_key].split(".")[0]
csv_pathname = os.path.join(temp_dir, "{}.csv".format(table_name))

odo.odo("sqlite:///{}::{}".format(str(sqlite_file), table_name), csv_filename)
odo.odo("sqlite:///{}::{}".format(str(sqlite_file), table_name), csv_pathname)

df = pandas.read_csv(csv_filename)
df = pd.read_csv(csv_pathname)

assert df.shape[0] == dataset["ingest"]["{}_nrows".format(table_name)]
assert df.shape[0] == blob["nrows"]

assert df.shape[1] == dataset["ingest"]["{}_ncols".format(table_name)] + 1
assert df.shape[1] == blob["ncols"] + 1

if table_key != "image":
assert df.groupby(["TableNumber", "ImageNumber"]).size().sum() == \
dataset["ingest"]["{}_nrows".format(table_name)]
if table_name.lower() != "image":
assert df.groupby(["TableNumber", "ImageNumber"]).size().sum() == blob["nrows"]


def test_run_defaults(cellpainting, runner):
Expand All @@ -91,32 +78,18 @@ def test_run_defaults(cellpainting, runner):

assert result.exit_code == 0

config = configparser.ConfigParser()

config_file = pkg_resources.resource_filename(
"cytominer_database",
os.path.join("config", "config_cellpainting.ini")
)

with open(config_file, "r") as config_fd:
config.read_file(config_fd)

for (k, v) in dict({"cells": "Cells.csv", "cytoplasm": "Cytoplasm.csv", "nuclei": "Nuclei.csv"}).items():
config["filenames"][k] = v

for table_key in ["image", "cells", "cytoplasm", "nuclei"]:
csv_filename = os.path.join(temp_dir, config["filenames"][table_key])
for blob in cellpainting["ingest"]:
table_name = blob["table"]

table_name = config["filenames"][table_key].split(".")[0]
csv_pathname = os.path.join(temp_dir, "{}.csv".format(table_name))

odo.odo("sqlite:///{}::{}".format(str(sqlite_file), table_name), csv_filename)
odo.odo("sqlite:///{}::{}".format(str(sqlite_file), table_name), csv_pathname)

df = pandas.read_csv(csv_filename)
df = pd.read_csv(csv_pathname)

assert df.shape[0] == cellpainting["ingest"]["{}_nrows".format(table_name)]
assert df.shape[0] == blob["nrows"]

assert df.shape[1] == cellpainting["ingest"]["{}_ncols".format(table_name)] + 1
assert df.shape[1] == blob["ncols"] + 1

if table_key != "image":
assert df.groupby(["TableNumber", "ImageNumber"]).size().sum() == \
cellpainting["ingest"]["{}_nrows".format(table_name)]
if table_name.lower() != "image":
assert df.groupby(["TableNumber", "ImageNumber"]).size().sum() == blob["nrows"]
Loading

0 comments on commit 1a98b4d

Please sign in to comment.