diff --git a/koza/io/reader/csv_reader.py b/koza/io/reader/csv_reader.py
index cd556b6..9d1984f 100644
--- a/koza/io/reader/csv_reader.py
+++ b/koza/io/reader/csv_reader.py
@@ -151,6 +151,7 @@ def __next__(self) -> Dict[str, Any]:
 
         else:
             self.fieldnames = self.field_type_map.keys()
+
         try:
             row = next(self.reader)
         except StopIteration:
@@ -171,25 +172,32 @@ def __next__(self) -> Dict[str, Any]:
 
         # to determine what to do here
         fields_len = len(self.fieldnames)
         row_len = len(row)
+
+        # if we've made it here we can convert a row to a dict
+        field_map = dict(zip(self.fieldnames, row))
+
         if fields_len > row_len:
-            LOG.warning(
+            raise ValueError(
                 f"CSV file {self.name} has {fields_len - row_len} fewer columns at {self.reader.line_num}"
             )
-        elif row_len > fields_len:
+
+        elif fields_len < row_len:
             LOG.warning(
                 f"CSV file {self.name} has {row_len - fields_len} extra columns at {self.reader.line_num}"
             )
-
-        # if we've made it here we can convert a row to a dict
-        field_map = dict(zip(self.fieldnames, row))
+            # Not sure if this would serve a purpose
+            #
+            # if not 'extra_cols' in self.field_type_map:
+            #     # Create a type map for extra columns
+            #     self.field_type_map['extra_cols'] = FieldType.str
+            # field_map['extra_cols'] = row[fields_len:]
 
         typed_field_map = {}
         for field, field_value in field_map.items():
-            # This is really unreadable - malkovich malkovich
             # Take the value and coerce it using self.field_type_map (field: FieldType)
             # FIELD_TYPE is map of the field_type enum to the python
-            # built-in type or custom extras defined in koza
+            # built-in type or custom extras defined in the source config
             try:
                 typed_field_map[field] = FIELDTYPE_CLASS[self.field_type_map[field]](field_value)
             except KeyError as key_error:
diff --git a/koza/model/config/source_config.py b/koza/model/config/source_config.py
index 97410e3..da0bf98 100644
--- a/koza/model/config/source_config.py
+++ b/koza/model/config/source_config.py
@@ -87,24 +87,24 @@ class FieldType(str, Enum):
     str = 'str'
     int = 'int'
     float = 'float'
-    # Proportion = 'Proportion'
 
 
 class OutputFormat(str, Enum):
     """
-    Have this set up but for prototyping removing this
-    as an option to only support the TSV output format
+    Output formats
     """
 
-    tsv = 'tsv'
-    json = 'json'
+    tsv = 'tsv'  # TODO
     jsonl = 'jsonl'
+    kgx = 'kgx'
 
 
 class TransformMode(str, Enum):
     """
-    Have this set up but for prototyping removing this
-    as an option to only support the TSV output format
+    Configures how an external transform file is processed
+    flat uses importlib and watches for a StopIteration
+    exception, loop runs the code once and expects that
+    a for loop is being used to iterate over a file
     """
 
     flat = 'flat'
@@ -128,6 +128,7 @@ class DatasetDescription:
     These currently do not serve a purpose in koza
     other than documentation
     """
 
+    id: str = None  # TODO constrain to a curie?
     name: str = None  # If empty use source name
     ingest_title: str = None  # Map to biolink name
@@ -303,6 +304,10 @@ def field_type_map(self):
 
 @dataclass(config=PydanticConfig)
 class PrimaryFileConfig(SourceConfig):
+    """
+    node_properties and edge_properties are used for configuring
+    the KGX writer
+    """
     node_properties: List[str] = None
     edge_properties: List[str] = None
     depends_on: List[str] = field(default_factory=list)