Make merge write-disposition fall back to append if no primary or merge keys are specified #1225
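As a rough sketch of the behaviour the title describes (a hypothetical helper, not the actual dlt code path; the table-schema layout and function name are assumptions), a merge disposition can only be honoured when the table defines at least one primary or merge key, and otherwise degrades to a plain append:

```python
from typing import Any, Dict


def effective_write_disposition(table: Dict[str, Any]) -> str:
    """Hypothetical sketch: pick the disposition that will actually be used."""
    disposition = table.get("write_disposition", "append")
    if disposition != "merge":
        return disposition
    # merging needs at least one primary or merge key column to match rows on
    has_keys = any(
        col.get("primary_key") or col.get("merge_key")
        for col in table.get("columns", {}).values()
    )
    # with no keys there is nothing to merge on, so fall back to append
    return "merge" if has_keys else "append"


# a "merge" table without any key columns is treated as append
print(effective_write_disposition({"write_disposition": "merge", "columns": {"id": {}}}))
```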
@@ -226,6 +226,10 @@ def destinations_configs(
         DestinationTestConfiguration(destination="synapse", supports_dbt=False),
     ]

+    # sanity check that when selecting default destinations, one of each sql destination is actually
+    # provided
+    assert set(SQL_DESTINATIONS) == {d.destination for d in destination_configs}
Review comment (on the added assert): this has happened a few times already... not really related to this PR.
+
     if default_vector_configs:
         # for now only weaviate
         destination_configs += [DestinationTestConfiguration(destination="weaviate")]
@@ -3,6 +3,7 @@
 import pytest
 import random
 from os import environ
+import io
Review comment (on the added `import io`): I pulled this over from my filesystem state branch because I needed those changes here.

 import dlt
 from dlt.common import json, sleep
@@ -80,7 +81,7 @@ def assert_data_table_counts(p: dlt.Pipeline, expected_counts: DictStrAny) -> No
     ), f"Table counts do not match, expected {expected_counts}, got {table_counts}"


-def load_file(path: str, file: str) -> Tuple[str, List[Dict[str, Any]]]:
+def load_file(fs_client, path: str, file: str) -> Tuple[str, List[Dict[str, Any]]]:
     """
     util function to load a filesystem destination file and return parsed content
     values may not be cast to the right type, especially for insert_values, please
@@ -96,47 +97,43 @@ def load_file(path: str, file: str) -> Tuple[str, List[Dict[str, Any]]]:

     # table name will be last element of path
     table_name = path.split("/")[-1]

     # skip loads table
     if table_name == "_dlt_loads":
         return table_name, []

     full_path = posixpath.join(path, file)

     # load jsonl
     if ext == "jsonl":
-        with open(full_path, "rU", encoding="utf-8") as f:
-            for line in f:
+        file_text = fs_client.read_text(full_path)
+        for line in file_text.split("\n"):
             if line:
                 result.append(json.loads(line))

     # load insert_values (this is a bit volatile if the exact format of the source file changes)
     elif ext == "insert_values":
-        with open(full_path, "rU", encoding="utf-8") as f:
-            lines = f.readlines()
-            # extract col names
-            cols = lines[0][15:-2].split(",")
-            for line in lines[2:]:
+        file_text = fs_client.read_text(full_path)
+        lines = file_text.split("\n")
+        cols = lines[0][15:-2].split(",")
+        for line in lines[2:]:
             if line:
                 values = line[1:-3].split(",")
                 result.append(dict(zip(cols, values)))

     # load parquet
     elif ext == "parquet":
         import pyarrow.parquet as pq

-        with open(full_path, "rb") as f:
-            table = pq.read_table(f)
-            cols = table.column_names
-            count = 0
-            for column in table:
-                column_name = cols[count]
-                item_count = 0
-                for item in column.to_pylist():
-                    if len(result) <= item_count:
-                        result.append({column_name: item})
-                    else:
-                        result[item_count][column_name] = item
-                    item_count += 1
-                count += 1
+        file_bytes = fs_client.read_bytes(full_path)
+        table = pq.read_table(io.BytesIO(file_bytes))
+        cols = table.column_names
+        count = 0
+        for column in table:
+            column_name = cols[count]
+            item_count = 0
+            for item in column.to_pylist():
+                if len(result) <= item_count:
+                    result.append({column_name: item})
+                else:
+                    result[item_count][column_name] = item
+                item_count += 1
+            count += 1

     return table_name, result
@@ -149,18 +146,14 @@ def load_files(p: dlt.Pipeline, *table_names: str) -> Dict[str, List[Dict[str, A
         client.dataset_path, detail=False, refresh=True
     ):
         for file in files:
-            table_name, items = load_file(basedir, file)
+            table_name, items = load_file(client.fs_client, basedir, file)
             if table_name not in table_names:
                 continue
             if table_name in result:
                 result[table_name] = result[table_name] + items
             else:
                 result[table_name] = items

-    # loads file is special case
-    if LOADS_TABLE_NAME in table_names and file.find(".{LOADS_TABLE_NAME}."):
-        result[LOADS_TABLE_NAME] = []
-
     return result
Review comment: this test shows that the filesystem destination always falls back to append when "merge" is set, which is also what the docs say. I'm not sure why this test previously claimed it would replace when there is a primary key; that did not work before and it does not work that way now, and the test itself was rather strange (I'm not sure if I wrote it), but now it runs correctly. If we want to replace in certain cases, I'll have to add that behaviour and specify it in the docs.
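For context, here is a minimal usage sketch of the behaviour described in the comment above (pipeline, dataset, and resource names are made up, and a filesystem bucket_url is assumed to be configured elsewhere): a resource declared with `write_disposition="merge"` still loads, but on the filesystem destination repeated runs simply append new files.

```python
import dlt


@dlt.resource(write_disposition="merge", primary_key="id")
def items():
    yield [{"id": 1, "value": "a"}, {"id": 2, "value": "b"}]


# assumes the filesystem destination (bucket_url, credentials) is configured via
# config/secrets; the names below are illustrative only
pipeline = dlt.pipeline(
    pipeline_name="fs_merge_fallback",
    destination="filesystem",
    dataset_name="demo",
)

pipeline.run(items())
pipeline.run(items())  # second run appends new files rather than merging on "id"
```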