Skip to content

Commit

Permalink
docstring cleanup; csv futzing
Browse files Browse the repository at this point in the history
  • Loading branch information
kshefchek committed Sep 27, 2021
1 parent efefe50 commit 1cc9b48
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 14 deletions.
22 changes: 15 additions & 7 deletions koza/io/reader/csv_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ def __next__(self) -> Dict[str, Any]:

else:
self.fieldnames = self.field_type_map.keys()

try:
row = next(self.reader)
except StopIteration:
Expand All @@ -171,25 +172,32 @@ def __next__(self) -> Dict[str, Any]:
# to determine what to do here
fields_len = len(self.fieldnames)
row_len = len(row)

# if we've made it here we can convert a row to a dict
field_map = dict(zip(self.fieldnames, row))

if fields_len > row_len:
LOG.warning(
raise ValueError(
f"CSV file {self.name} has {fields_len - row_len} fewer columns at {self.reader.line_num}"
)
elif row_len > fields_len:

elif fields_len < row_len:
LOG.warning(
f"CSV file {self.name} has {row_len - fields_len} extra columns at {self.reader.line_num}"
)

# if we've made it here we can convert a row to a dict
field_map = dict(zip(self.fieldnames, row))
# Not sure if this would serve a purpose
#
# if not 'extra_cols' in self.field_type_map:
# # Create a type map for extra columns
# self.field_type_map['extra_cols'] = FieldType.str
# field_map['extra_cols'] = row[fields_len:]

typed_field_map = {}

for field, field_value in field_map.items():
# This is really unreadable - malkovich malkovich
# Take the value and coerce it using self.field_type_map (field: FieldType)
# FIELDTYPE_CLASS is a map of the FieldType enum to the Python
# built-in type or custom extras defined in koza
# to built-in type or custom extras defined in the source config
try:
typed_field_map[field] = FIELDTYPE_CLASS[self.field_type_map[field]](field_value)
except KeyError as key_error:
Expand Down
19 changes: 12 additions & 7 deletions koza/model/config/source_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,24 +87,24 @@ class FieldType(str, Enum):
str = 'str'
int = 'int'
float = 'float'
# Proportion = 'Proportion'


class OutputFormat(str, Enum):
"""
Have this set up but for prototyping removing this
as an option to only support the TSV output format
Output formats
"""

tsv = 'tsv'
json = 'json'
tsv = 'tsv' # TODO
jsonl = 'jsonl'
kgx = 'kgx'


class TransformMode(str, Enum):
"""
Have this set up but for prototyping removing this
as an option to only support the TSV output format
Configures how an external transform file is processed.
'flat' uses importlib and watches for a StopIteration
exception; 'loop' runs the code once and expects that
a for loop is being used to iterate over a file.
"""

flat = 'flat'
Expand All @@ -128,6 +128,7 @@ class DatasetDescription:
These currently do not serve a purpose in koza other
than documentation
"""

id: str = None # TODO constrain to a curie?
name: str = None # If empty use source name
ingest_title: str = None # Map to biolink name
Expand Down Expand Up @@ -303,6 +304,10 @@ def field_type_map(self):

@dataclass(config=PydanticConfig)
class PrimaryFileConfig(SourceConfig):
"""
node_properties and edge_properties are used for configuring
the KGX writer
"""
node_properties: List[str] = None
edge_properties: List[str] = None
depends_on: List[str] = field(default_factory=list)
Expand Down

0 comments on commit 1cc9b48

Please sign in to comment.