From e14251cebb8ebae968a0e7b7c8e68d6b5918dd93 Mon Sep 17 00:00:00 2001 From: Keith McHugh Date: Mon, 18 Sep 2023 18:58:30 -0700 Subject: [PATCH] Add utility for transforming Trino ROW type columns into Ruby hashes (#117) --- lib/trino/client/column_value_parser.rb | 115 ++++++++++++++++++++++ lib/trino/client/query.rb | 28 ++++++ spec/client_spec.rb | 61 ++++++++++-- spec/column_value_parser_spec.rb | 125 ++++++++++++++++++++++++ 4 files changed, 323 insertions(+), 6 deletions(-) create mode 100644 lib/trino/client/column_value_parser.rb create mode 100644 spec/column_value_parser_spec.rb diff --git a/lib/trino/client/column_value_parser.rb b/lib/trino/client/column_value_parser.rb new file mode 100644 index 00000000..cfc9dff2 --- /dev/null +++ b/lib/trino/client/column_value_parser.rb @@ -0,0 +1,115 @@ +module Trino::Client + class ColumnValueParser + INSIDE_MATCHING_PARENS_REGEX = /\((?>[^)(]+|\g<0>)*\)/ + + attr_reader :name, :type, :scalar_parser + + def initialize(column, scalar_parser = nil) + @name = column.name + @type = prepare_type_for_parsing(column.type) + @scalar_parser = scalar_parser + end + + # Public: Parse the value of a row's field by using its column's Trino type. + # Trino types can be scalars like VARCHAR and TIMESTAMP or complex types + # like ARRAY and ROW. ROW types are treated as objects. + # An ARRAY column's type is an array of types as you'd expect. A ROW + # column's type is a comma-separated list of space-separated (name, type) tuples. + # + # data - The value of a row's field. Can be a string, number, an array of those, + # or an arrays of arrays, etc. + # dtype - The Trino type string of the column. See above explanation. + # + # Returns: + # - The given value for strings and numbers + # - A Time for timestamps + # - A Hash of { field1 => value1, field2 => value2, ...etc } for row types + # - An array of the above for array types + def value(data, dtype = type) + # Convert Trino ARRAY elements into Ruby Arrays + if starts_with?(dtype, 'array(') + return parse_array_element(data, dtype) + + # Convert Trino ROW elements into Ruby Hashes + elsif starts_with?(dtype, 'row(') + return parse_row_element(data, dtype) + + # If defined, use scalar_parser to convert scalar types + elsif !scalar_parser.nil? + return scalar_parser.call(data, dtype) + end + + # Otherwise, values are returned unaltered + data + end + + private + + # Private: Remove quotation marks and handle recent versions of + # Trino having a 'with time zone' suffix on some fields that breaks + # out assumption that types don't have spaces in them. + # + # Returns a string. + def prepare_type_for_parsing(type) + type.gsub('"', '').gsub(' with time zone', '_with_time_zone') + end + + def parse_array_element(data, dtype) + # If the element is empty, return an empty array + return [] if blank?(data) + + # Inner data type will be the current dtype with `array(` and `)` chopped off + inner_dtype = dtype.match(INSIDE_MATCHING_PARENS_REGEX)[0][1..-2] + + data.map { |inner_data| value(inner_data, inner_dtype) } + end + + def parse_row_element(data, dtype) + # If the element is empty, return an empty object + return {} if blank?(data) + + parsed_row_element = {} + + inner_dtype = dtype.match(INSIDE_MATCHING_PARENS_REGEX)[0][1..-2] + elems = inner_dtype.split(' ') + num_elems_to_skip = 0 + field_position = 0 + + # Iterate over each datatype of the row and mutate parsed_row_element + # to have a key of the field name and value for that field's value. + elems.each_with_index do |field, i| + # We detected an array or row and are skipping all of the elements within it + # since its conversion was handled by calling `value` recursively. + if num_elems_to_skip.positive? + num_elems_to_skip -= 1 + next + end + + # Field names never have these characters and are never the last element. + next if field.include?(',') || field.include?('(') || field.include?(')') || i == elems.length - 1 + + type = elems[(i + 1)..].join(' ') + + # If this row has a nested array or row, the type of this field is that array or row's type. + if starts_with?(type, 'array(') || starts_with?(type, 'row(') + datatype = type.sub(/\(.*/, '') + type = "#{datatype}#{type.match(INSIDE_MATCHING_PARENS_REGEX)[0]}" + num_elems_to_skip = type.split(' ').length # see above comment about num_elems_to_skip + end + + parsed_row_element[field] = value(data[field_position], type) + field_position += 1 + end + + parsed_row_element + end + + def blank?(obj) + obj.respond_to?(:empty?) ? !!obj.empty? : !obj + end + + def starts_with?(str, prefix) + prefix.respond_to?(:to_str) && str[0, prefix.length] == prefix + end + end +end diff --git a/lib/trino/client/query.rb b/lib/trino/client/query.rb index c6e256ee..02d6ad39 100644 --- a/lib/trino/client/query.rb +++ b/lib/trino/client/query.rb @@ -18,6 +18,7 @@ module Trino::Client require 'faraday' require 'faraday/gzip' require 'faraday/follow_redirects' + require 'trino/client/column_value_parser' require 'trino/client/models' require 'trino/client/errors' require 'trino/client/faraday_client' @@ -44,6 +45,19 @@ def self.faraday_client(options) Trino::Client.faraday_client(options) end + def self.transform_row(column_value_parsers, row) + row_object = {} + + row.each_with_index do |element, i| + column = column_value_parsers[i] + value = column.value(element) + + row_object[column.name] = value + end + + row_object + end + def initialize(api) @api = api end @@ -86,6 +100,20 @@ def columns return @api.current_results.columns end + def column_value_parsers + @column_value_parsers ||= columns.map {|column| + ColumnValueParser.new(column) + } + end + + def transform_rows + rows.map(&:transform_row) + end + + def transform_row(row) + self.class.transform_row(column_value_parsers, row) + end + def rows rows = [] each_row_chunk {|chunk| diff --git a/spec/client_spec.rb b/spec/client_spec.rb index cb163b81..e0bc56f5 100644 --- a/spec/client_spec.rb +++ b/spec/client_spec.rb @@ -8,15 +8,17 @@ [ Models::Column.new(name: 'animal', type: 'string'), Models::Column.new(name: 'score', type: 'integer'), - Models::Column.new(name: 'name', type: 'string') + Models::Column.new(name: 'name', type: 'string'), + Models::Column.new(name: 'foods', type: 'array(string string)'), + Models::Column.new(name: 'traits', type: 'row(breed string, num_spots integer)') ] end it 'multiple rows' do rows = [ - ['dog', 1, 'Lassie'], - ['horse', 5, 'Mr. Ed'], - ['t-rex', 37, 'Doug'] + ['dog', 1, 'Lassie', ['kibble', 'peanut butter'], ['spaniel', 2]], + ['horse', 5, 'Mr. Ed', ['hay', 'sugar cubes'], ['some horse', 0]], + ['t-rex', 37, 'Doug', ['rodents', 'small dinos'], ['dino', 0]] ] client.stub(:run).and_return([columns, rows]) @@ -27,18 +29,61 @@ expect(rehashed[0]['animal']).to eq 'dog' expect(rehashed[0]['score']).to eq 1 expect(rehashed[0]['name']).to eq 'Lassie' + expect(rehashed[0]['foods']).to eq ['kibble', 'peanut butter'] + expect(rehashed[0]['traits']).to eq ['spaniel', 2] expect(rehashed[0].values[0]).to eq 'dog' expect(rehashed[0].values[1]).to eq 1 expect(rehashed[0].values[2]).to eq 'Lassie' + expect(rehashed[0].values[3]).to eq ['kibble', 'peanut butter'] + expect(rehashed[0].values[4]).to eq ['spaniel', 2] expect(rehashed[1]['animal']).to eq 'horse' expect(rehashed[1]['score']).to eq 5 expect(rehashed[1]['name']).to eq 'Mr. Ed' + expect(rehashed[1]['foods']).to eq ['hay', 'sugar cubes'] + expect(rehashed[1]['traits']).to eq ['some horse', 0] expect(rehashed[1].values[0]).to eq 'horse' expect(rehashed[1].values[1]).to eq 5 expect(rehashed[1].values[2]).to eq 'Mr. Ed' + expect(rehashed[1].values[3]).to eq ['hay', 'sugar cubes'] + expect(rehashed[1].values[4]).to eq ['some horse', 0] + end + + it 'transforms rows into Ruby objects' do + rows = [ + ['dog', 1, 'Lassie', ['kibble', 'peanut butter'], ['spaniel', 2]], + ['horse', 5, 'Mr. Ed', ['hay', 'sugar cubes'], ['some horse', 0]], + ['t-rex', 37, 'Doug', ['rodents', 'small dinos'], ['dino', 0]] + ] + client.stub(:run).and_return([columns, rows]) + + columns, rows = client.run('fake query') + column_value_parsers = columns.map { |column| Trino::Client::ColumnValueParser.new(column) } + transformed_rows = rows.map { |row| Trino::Client::Query.transform_row(column_value_parsers, row) } + + expect(transformed_rows[0]).to eq({ + "animal" => "dog", + "score" => 1, + "name" => "Lassie", + "foods" => ["kibble", "peanut butter"], + "traits" => { + "breed" => "spaniel", + "num_spots" => 2, + }, + }) + + expect(transformed_rows[1]).to eq({ + "animal" => "horse", + "score" => 5, + "name" => "Mr. Ed", + "foods" => ["hay", "sugar cubes"], + "traits" => { + "breed" => "some horse", + "num_spots" => 0, + }, + }) end it 'empty results' do @@ -58,17 +103,21 @@ "animal" => "wrong", "score" => "count", "name" => nil, + "foods" => nil, + "traits" => nil }] end it 'handles too many result columns' do - rows = [['wrong', 'count', 'too', 'much', 'columns']] + rows = [['wrong', 'count', 'too', 'too', 'too', 'much', 'columns']] client.stub(:run).and_return([columns, rows]) expect(client.run_with_names('fake query')).to eq [{ "animal" => "wrong", "score" => "count", - "name" => 'too', + "name" => "too", + "foods" => "too", + "traits" => "too" }] end end diff --git a/spec/column_value_parser_spec.rb b/spec/column_value_parser_spec.rb new file mode 100644 index 00000000..ea8dd955 --- /dev/null +++ b/spec/column_value_parser_spec.rb @@ -0,0 +1,125 @@ +require 'spec_helper' + +describe Trino::Client::ColumnValueParser do + def column_value(data, type, scalar_parser = nil) + column = Struct.new(:type, :name).new(type) + Trino::Client::ColumnValueParser.new(column, scalar_parser).value(data) + end + + it 'parses varchar values' do + data = 'a string' + type = 'varchar' + expected_value = 'a string' + expect(column_value(data, type)).to eq(expected_value) + end + + it 'converts scalar values if configured to do so' do + data = '2022-07-01T14:53:02Z' + type = 'timestamp with time zone' + scalar_parser = ->(value, _dtype) { Time.parse(value) } + expected_value = Time.parse(data) + expect(column_value(data, type, scalar_parser)).to eq(expected_value) + end + + it 'parses array type values' do + data = [1, 2, 3, 4] + type = 'array(integer, integer, integer, integer)' + expected_value = [1, 2, 3, 4] + expect(column_value(data, type)).to eq(expected_value) + end + + it 'parses row type values' do + data = [ + 'userId', + 'userLogin', + 'SKU_FREE', + 'TYPE_USER', + '2022-07-01T14:53:02Z', + '' + ] + type = 'row(id varchar, "name" varchar, plan_sku varchar, type varchar, created_at timestamp with time zone, organization_tenant_name varchar)' + expected_value = { + 'id' => 'userId', + 'name' => 'userLogin', + 'plan_sku' => 'SKU_FREE', + 'type' => 'TYPE_USER', + 'created_at' => '2022-07-01T14:53:02Z', + 'organization_tenant_name' => '' + } + expect(column_value(data, type)).to eq(expected_value) + end + + it 'parses an array of row type values' do + data = [[ + 'userId', + 'userLogin', + 'SKU_FREE', + 'TYPE_USER', + '2022-07-01T14:53:02Z', + '' + ]] + type = 'array(row(id varchar, "name" varchar, plan_sku varchar, type varchar, created_at timestamp with time zone, organization_tenant_name varchar))' + expected_value = [{ + 'id' => 'userId', + 'name' => 'userLogin', + 'plan_sku' => 'SKU_FREE', + 'type' => 'TYPE_USER', + 'created_at' => '2022-07-01T14:53:02Z', + 'organization_tenant_name' => '' + }] + expect(column_value(data, type)).to eq(expected_value) + end + + it 'parses row type values that have an array in them' do + data = [ + 'userId', + %w[userLogin1 userLogin2], + 'value' + ] + type = 'row(id varchar, logins array(varchar), onemore varchar)' + expected_value = { + 'id' => 'userId', + 'logins' => %w[userLogin1 userLogin2], + 'onemore' => 'value' + } + expect(column_value(data, type)).to eq(expected_value) + end + + it 'parses row type values that have a row in them' do + data = [ + 'userId', + ['userLogin', '2022-07-01T14:53:02Z', 1234], + 'value' + ] + type = 'row(id varchar, subobj row(login varchar, created_at timestamp with time zone, id integer), onemore varchar)' + expected_value = { + 'id' => 'userId', + 'subobj' => { + 'login' => 'userLogin', + 'created_at' => '2022-07-01T14:53:02Z', + 'id' => 1234 + }, + 'onemore' => 'value' + } + expect(column_value(data, type)).to eq(expected_value) + end + + it 'parses row type values that have nested rows in them' do + data = [ + 'userId', + ['userLogin', '2022-07-01T14:53:02Z', [1234]], + 'value' + ] + type = 'row(id varchar, subobj row(login varchar, created_at timestamp with time zone, id row(subid integer)), onemore varchar)' + expected_value = { + 'id' => 'userId', + 'subobj' => { + 'login' => 'userLogin', + 'created_at' => '2022-07-01T14:53:02Z', + 'id' => {'subid' => 1234} + }, + 'onemore' => 'value' + } + expect(column_value(data, type)).to eq(expected_value) + end +end