Skip to content

Commit

Permalink
Merge pull request #92 from /issues/89
Browse files Browse the repository at this point in the history
Ingest data even if only image CSVs are present
  • Loading branch information
mcquin authored Jan 24, 2018
2 parents 9a6b198 + 8a36b56 commit 1a98b4d
Show file tree
Hide file tree
Showing 11 changed files with 212 additions and 156 deletions.
25 changes: 10 additions & 15 deletions cytominer_database/commands/command_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,29 +39,24 @@
"--munge/--no-munge",
default=True,
help="""\
True if the CSV files for individual compartments
have been merged into a single CSV file. If True,
the CSV will be split into one CSV per compartment.
True if the CSV files for individual compartments \
have been merged into a single CSV file; \
the CSV will be split into one CSV per compartment \
(Default: true).
"""
)
@click.option(
"--skip-image-prefix/--no-skip-image-prefix",
default=True,
help="""\
True if the prefix of image table name should be
excluded from the names of columns from per image
table e.g. use `Metadata_Plate` instead of
`Image_Metadata_Plate`.
True if the prefix of image table name should be \
excluded from the names of columns from per image \
table e.g. use `Metadata_Plate` instead of \
`Image_Metadata_Plate` (Default: true).
"""
)
def command(source, target, config_file, munge, skip_image_prefix):
config = configparser.ConfigParser()

with open(config_file, "r") as config_fd:
config.read_file(config_fd)

if munge:
cytominer_database.munge.munge(config=config, source=source)
cytominer_database.munge.munge(config_file=config_file, source=source)

cytominer_database.ingest.seed(source, target, config,
skip_image_prefix)
cytominer_database.ingest.seed(source, target, config_file, skip_image_prefix)
3 changes: 3 additions & 0 deletions cytominer_database/config/config_default.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[filenames]
image = Image.csv
experiment = Experiment.csv
17 changes: 6 additions & 11 deletions cytominer_database/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,8 @@
Example::
import configparser
import cytominer_database.ingest
config = configparser.ConfigParser()
with open(config_file, "r") as config_fd:
config.read_file(config_fd)
cytominer_database.ingest.seed(source, target, config)
"""

Expand Down Expand Up @@ -100,22 +94,23 @@ def into(input, output, name, identifier, skip_table_prefix=False):
odo.odo(source, "{}::{}".format(output, name), has_header=True, delimiter=",")


def seed(source, target, config, skip_image_prefix=True):
def seed(source, target, config_file, skip_image_prefix=True):
"""
Read CSV files into a database backend.
:param config: Configuration file.
:param config_file: Configuration file.
:param source: Directory containing subdirectories that contain CSV files.
:param target: Connection string for the database.
:param skip_image_prefix: True if the prefix of image table name should be excluded
from the names of columns from per image table
"""
config_file = cytominer_database.utils.read_config(config_file)

directories = sorted(list(cytominer_database.utils.find_directories(source)))

for directory in directories:
try:
patterns, image = cytominer_database.utils.validate_csv_set(config, directory)
patterns, image = cytominer_database.utils.validate_csv_set(config_file, directory)
except IOError as e:
click.echo(e)

Expand All @@ -124,11 +119,11 @@ def seed(source, target, config, skip_image_prefix=True):
with open(image, "rb") as document:
identifier = hashlib.md5(document.read()).hexdigest()

name, _ = os.path.splitext(config["filenames"]["image"])
name, _ = os.path.splitext(config_file["filenames"]["image"])

try:
into(input=image, output=target, name=name.capitalize(), identifier=identifier,
skip_table_prefix = skip_image_prefix)
skip_table_prefix=skip_image_prefix)
except sqlalchemy.exc.DatabaseError as e:
click.echo(e)

Expand Down
20 changes: 12 additions & 8 deletions cytominer_database/munge.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
import logging
import os

import click
import pandas as pd

import cytominer_database.utils

logger = logging.getLogger(__name__)

def munge(config, source, target=None):

def munge(config_file, source, target=None):
"""
Searches ``source`` for directories containing a CSV file corresponding to
per-object measurements, then splits the CSV file into one CSV file per compartment.
Expand All @@ -15,7 +18,7 @@ def munge(config, source, target=None):
Cytoplasm, and Nuclei. ``munge`` will split this CSV file into 3 CSV files:
Cells.csv, Cytoplasm.csv, and Nuclei.csv.
:param config: Configuration file.
:param config_file: Configuration file.
:param source: Directory containing subdirectories that contain an object CSV file.
Expand All @@ -25,24 +28,25 @@ def munge(config, source, target=None):
Example::
import configparser
import cytominer_database.munge
config = configparser.ConfigParser()
with open(config_file, "r") as config_fd:
config.read_file(config_fd)
cytominer_database.munge.munge(source, target, config)
"""

config = cytominer_database.utils.read_config(config_file)

if not target:
target = source

directories = sorted(list(cytominer_database.utils.find_directories(source)))

valid_directories = [] # list of subdirectories that have an object CSV file.

if not config.has_option("filenames", "object"):
logger.warn("No object CSV configured, skipping `munge`.")

return valid_directories

for directory in directories:
try:
obj = pd.read_csv(os.path.join(directory, config["filenames"]["object"]), header=[0, 1])
Expand Down
56 changes: 51 additions & 5 deletions cytominer_database/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
import glob
import logging
import os
import pkg_resources
import tempfile
import warnings

import configparser
import csvkit.utilities.csvclean

# csvkit (or a dependency of csvkit) mucks with warning levels.
Expand Down Expand Up @@ -91,11 +93,7 @@ def validate_csv_set(config, directory):
if not os.path.isfile(image_csv):
raise IOError("{} not found in {}. Skipping.".format(config["filenames"]["image"], directory))

pattern_csvs = [filename for filename in glob.glob(os.path.join(directory, '*.csv')) if filename not in [
os.path.join(directory, config['filenames']['image']),
os.path.join(directory, config['filenames']['object']),
os.path.join(directory, config['filenames']['experiment'])
]]
pattern_csvs = collect_csvs(config, directory)

filenames = pattern_csvs + [image_csv]

Expand All @@ -106,3 +104,51 @@ def validate_csv_set(config, directory):
raise IOError("Some files were invalid: {}. Skipping {}.".format(invalid_files, directory))

return pattern_csvs, image_csv


def collect_csvs(config, directory):
    """
    Collect CSV files from a directory.

    This function collects CSV files in a directory, excluding those that have been specified in the configuration file.
    This enables collecting only those CSV files that correspond to cellular compartments. e.g. Cells.csv, Cytoplasm.csv,
    Nuclei.csv. CSV files corresponding to experiment, image, or object will be excluded.

    :param config: configuration file.

    :param directory: directory containing the CSV files.

    :return: a list of CSV files.
    """
    # Full paths of the configured experiment/image/object CSVs, when present.
    excluded = {
        os.path.join(directory, config["filenames"][option])
        for option in ("experiment", "image", "object")
        if config.has_option("filenames", option)
    }

    # Keep every CSV in the directory that is not one of the configured names.
    return [
        csv_path
        for csv_path in glob.glob(os.path.join(directory, "*.csv"))
        if csv_path not in excluded
    ]


def read_config(filename):
    """
    Read a configuration file, layered on top of the packaged defaults.

    The package's default configuration (``config/config_default.ini``) is
    read first, then *filename* is read on top of it, so options in
    *filename* override the defaults. A file that cannot be opened is
    skipped with a warning rather than raising, so a missing user config
    still yields the default configuration.

    :param filename: configuration filename

    :return: a configuration object
    """
    config = configparser.ConfigParser()

    for config_filename in [
        pkg_resources.resource_filename("cytominer_database", "config/config_default.ini"),  # default config file
        filename
    ]:
        try:
            with open(config_filename, "r") as fd:
                config.read_file(fd)
        except IOError:
            # Best-effort: an unreadable file leaves previously-read options
            # intact. Use warning() (warn() is deprecated) with lazy %-args.
            logger.warning("Unable to read configuration file: %s.", config_filename)

    return config
73 changes: 23 additions & 50 deletions tests/commands/test_command_ingest.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
import os

import click.testing
import configparser
import backports.tempfile
import odo
import pandas
import pkg_resources
import pandas as pd
import pytest

import cytominer_database.command
Expand All @@ -23,12 +21,10 @@ def test_help(runner):


def test_run(dataset, runner):
config_file = os.path.join(dataset["data_dir"], "config.ini")
opts = ["ingest"]

opts = [
"ingest",
"--config-file", config_file
]
if dataset["config"]:
opts += ["--config-file", os.path.join(dataset["data_dir"], dataset["config"])]

if dataset["munge"]:
opts += ["--munge"]
Expand All @@ -44,32 +40,23 @@ def test_run(dataset, runner):

result = runner.invoke(cytominer_database.command.command, opts)

assert result.exit_code == 0

config = configparser.ConfigParser()

with open(config_file, "r") as config_fd:
config.read_file(config_fd)

for (k, v) in dict({"cells": "Cells.csv", "cytoplasm": "Cytoplasm.csv", "nuclei": "Nuclei.csv"}).items():
config["filenames"][k] = v
assert result.exit_code == 0, result.output

for table_key in ["image", "cells", "cytoplasm", "nuclei"]:
csv_filename = os.path.join(temp_dir, config["filenames"][table_key])
for blob in dataset["ingest"]:
table_name = blob["table"]

table_name = config["filenames"][table_key].split(".")[0]
csv_pathname = os.path.join(temp_dir, "{}.csv".format(table_name))

odo.odo("sqlite:///{}::{}".format(str(sqlite_file), table_name), csv_filename)
odo.odo("sqlite:///{}::{}".format(str(sqlite_file), table_name), csv_pathname)

df = pandas.read_csv(csv_filename)
df = pd.read_csv(csv_pathname)

assert df.shape[0] == dataset["ingest"]["{}_nrows".format(table_name)]
assert df.shape[0] == blob["nrows"]

assert df.shape[1] == dataset["ingest"]["{}_ncols".format(table_name)] + 1
assert df.shape[1] == blob["ncols"] + 1

if table_key != "image":
assert df.groupby(["TableNumber", "ImageNumber"]).size().sum() == \
dataset["ingest"]["{}_nrows".format(table_name)]
if table_name.lower() != "image":
assert df.groupby(["TableNumber", "ImageNumber"]).size().sum() == blob["nrows"]


def test_run_defaults(cellpainting, runner):
Expand All @@ -91,32 +78,18 @@ def test_run_defaults(cellpainting, runner):

assert result.exit_code == 0

config = configparser.ConfigParser()

config_file = pkg_resources.resource_filename(
"cytominer_database",
os.path.join("config", "config_cellpainting.ini")
)

with open(config_file, "r") as config_fd:
config.read_file(config_fd)

for (k, v) in dict({"cells": "Cells.csv", "cytoplasm": "Cytoplasm.csv", "nuclei": "Nuclei.csv"}).items():
config["filenames"][k] = v

for table_key in ["image", "cells", "cytoplasm", "nuclei"]:
csv_filename = os.path.join(temp_dir, config["filenames"][table_key])
for blob in cellpainting["ingest"]:
table_name = blob["table"]

table_name = config["filenames"][table_key].split(".")[0]
csv_pathname = os.path.join(temp_dir, "{}.csv".format(table_name))

odo.odo("sqlite:///{}::{}".format(str(sqlite_file), table_name), csv_filename)
odo.odo("sqlite:///{}::{}".format(str(sqlite_file), table_name), csv_pathname)

df = pandas.read_csv(csv_filename)
df = pd.read_csv(csv_pathname)

assert df.shape[0] == cellpainting["ingest"]["{}_nrows".format(table_name)]
assert df.shape[0] == blob["nrows"]

assert df.shape[1] == cellpainting["ingest"]["{}_ncols".format(table_name)] + 1
assert df.shape[1] == blob["ncols"] + 1

if table_key != "image":
assert df.groupby(["TableNumber", "ImageNumber"]).size().sum() == \
cellpainting["ingest"]["{}_nrows".format(table_name)]
if table_name.lower() != "image":
assert df.groupby(["TableNumber", "ImageNumber"]).size().sum() == blob["nrows"]
Loading

0 comments on commit 1a98b4d

Please sign in to comment.