Skip to content

Commit

Permalink
Refactored projections to compute transforms beforehand
Browse files Browse the repository at this point in the history
  • Loading branch information
dsblank committed May 19, 2023
1 parent 193b0f6 commit 1494b7b
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 122 deletions.
98 changes: 48 additions & 50 deletions backend/kangas/datatypes/embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import random
import time

from ..server.utils import Cache, pickle_dumps
from ..server.utils import Cache
from .base import Asset
from .utils import get_color, get_file_extension, is_valid_file_path

Expand Down Expand Up @@ -174,6 +174,8 @@ def get_statistics(cls, datagrid, col_name, field_name):

projection = None
batch = []
asset_ids = []

for row in datagrid.conn.execute(
"""SELECT {field_name} as assetId, asset_data, asset_metadata from datagrid JOIN assets ON assetId = assets.asset_id;""".format(
field_name=field_name
Expand All @@ -195,71 +197,67 @@ def get_statistics(cls, datagrid, col_name, field_name):
asset_data = json.loads(asset_data_json)
vector = prepare_embedding(asset_data["vector"], dimensions, seed)

# Save asset_id to update assets next
batch.append(vector)
if projection is None or projection == "pca":
asset_ids.append(asset_id)

if projection == "pca":
projection_name = "pca"
elif projection == "t-sne":
projection_name = "t-sne"
elif projection == "umap":
projection_name = "umap"
else:
raise Exception("projection not found")

if projection_name == "pca":
from sklearn.decomposition import PCA

projection = PCA(n_components=2)
transformed = projection.fit_transform(np.array(batch))
x_max = float(transformed[:, 0].max())
x_min = float(transformed[:, 0].min())
y_max = float(transformed[:, 1].max())
y_min = float(transformed[:, 1].min())
x_span = abs(x_max - x_min)
x_max += x_span * 0.1
x_min -= x_span * 0.1
y_span = abs(y_max - y_min)
y_max += y_span * 0.1
y_min -= y_span * 0.1
other = json.dumps(
{
"pca_eigen_vectors": projection.components_.tolist(),
"pca_mean": projection.mean_.tolist(),
"projection": projection_name,
"x_range": [x_min, x_max],
"y_range": [y_min, y_max],
"dimensions": dimensions,
"seed": seed,
}
)

elif projection_name == "t-sne":
from openTSNE import TSNE
from sklearn.manifold import TSNE

projection = TSNE()
transformed = projection.fit(np.array(batch))
x_max = float(transformed[:, 0].max())
x_min = float(transformed[:, 0].min())
y_max = float(transformed[:, 1].max())
y_min = float(transformed[:, 1].min())
x_span = abs(x_max - x_min)
x_max += x_span * 0.1
x_min -= x_span * 0.1
y_span = abs(y_max - y_min)
y_max += y_span * 0.1
y_min -= y_span * 0.1
other = json.dumps(
{
"projection": projection_name,
"pickled_projection": pickle_dumps(transformed),
"x_range": [x_min, x_max],
"y_range": [y_min, y_max],
"dimensions": dimensions,
"seed": seed,
}
)
transformed = projection.fit_transform(np.array(batch))

elif projection_name == "umap":
projection_name = (projection_name,)
other = json.dumps(
{
"projection": projection_name,
}
pass # TODO

x_max = float(transformed[:, 0].max())
x_min = float(transformed[:, 0].min())
y_max = float(transformed[:, 1].max())
y_min = float(transformed[:, 1].min())
x_span = abs(x_max - x_min)
x_max += x_span * 0.1
x_min -= x_span * 0.1
y_span = abs(y_max - y_min)
y_max += y_span * 0.1
y_min -= y_span * 0.1
other = json.dumps(
{
"x_range": [x_min, x_max],
"y_range": [y_min, y_max],
}
)

# update assets with transformed
cursor = datagrid.conn.cursor()
for asset_id, tran in zip(asset_ids, transformed):
sql = """SELECT asset_data from assets WHERE asset_id = ?;"""
asset_data_json = datagrid.conn.execute(sql, (asset_id,)).fetchone()[0]
asset_data = json.loads(asset_data_json)
asset_data["projection_transform"] = tran.tolist()
asset_data_json = json.dumps(asset_data)
sql = """UPDATE assets SET asset_data = ? WHERE asset_id = ?;"""
cursor.execute(
sql,
(
asset_data_json,
asset_id,
),
)
datagrid.conn.commit()

return [minimum, maximum, avg, variance, total, stddev, other, name]
81 changes: 10 additions & 71 deletions backend/kangas/server/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,7 @@
pytype_to_dgtype,
)
from .computed_columns import unify_computed_columns, update_state
from .utils import (
Cache,
pickle_loads_embedding_unsafe,
process_about,
safe_compile,
safe_env,
)
from .utils import Cache, process_about, safe_compile, safe_env

LOGGER = logging.getLogger(__name__)
KANGAS_ROOT = os.environ.get("KANGAS_ROOT", ".")
Expand All @@ -63,7 +57,6 @@
VALID_CHARS = string.ascii_letters + string.digits + "_"

PROJECTION_TRACE_CACHE = Cache(100)
PROJECTION_EMBEDDING_CACHE = Cache(50)


def sqlite_query_explain(
Expand Down Expand Up @@ -2233,17 +2226,11 @@ def process_projection_asset_ids(
name,
cur,
asset_ids,
projection_name,
projection,
traces,
size,
default_color,
color_override=None,
projection_dimensions=None,
projection_seed=None,
):
from ..datatypes.embedding import prepare_embedding

# asset_ids is a list of str
# side-effect: adds to traces
# Turn to string:
Expand All @@ -2256,14 +2243,11 @@ def process_projection_asset_ids(
)

trace_data = {}
print("using seed", projection_seed)

for asset_data_row in cur.execute(sql):
asset_data_raw = asset_data_row[0]
asset_data = json.loads(asset_data_raw)
vector_reduced = prepare_embedding(
asset_data["vector"], projection_dimensions, projection_seed
)
transform = asset_data["projection_transform"]
if color_override:
color = color_override
elif asset_data["color"]:
Expand All @@ -2283,26 +2267,25 @@ def process_projection_asset_ids(

if trace_name not in trace_data:
trace_data[trace_name] = {
"vectors": [],
"transform": [],
"colors": [],
"texts": [],
"customdata": [],
}

trace_data[trace_name]["texts"].append(asset_data.get("text"))
trace_data[trace_name]["vectors"].append(vector_reduced)
trace_data[trace_name]["transform"].append(transform)
trace_data[trace_name]["colors"].append(color)
trace_data[trace_name]["customdata"].append(row_id)

for trace_name in trace_data:
vectors = trace_data[trace_name]["vectors"]
transform = trace_data[trace_name]["transform"]
texts = trace_data[trace_name]["texts"]
colors = trace_data[trace_name]["colors"]
customdata = trace_data[trace_name]["customdata"]

eigen_vector = projection.transform(np.array(vectors))
xs = eigen_vector[:, 0].tolist()
ys = eigen_vector[:, 1].tolist()
xs = [xy[0] for xy in transform]
ys = [xy[1] for xy in transform]

if texts:
if any(texts):
Expand Down Expand Up @@ -2340,47 +2323,14 @@ def select_projection_data(
where_expr,
computed_columns,
):
from ..datatypes.embedding import prepare_embedding

conn = get_database_connection(dgid)
cur = conn.cursor()
unify_computed_columns(computed_columns)
metadata = get_metadata(conn)
column_limit = None
column_offset = 0

if "projection" in metadata[column_name]["other"]:
projection_name = metadata[column_name]["other"]["projection"]
else:
projection_name = "pca"

if projection_name == "pca":
from sklearn.decomposition import PCA

pca_eigen_vectors = metadata[column_name]["other"]["pca_eigen_vectors"]
pca_mean = metadata[column_name]["other"]["pca_mean"]
projection = PCA(n_components=2)
projection.components_ = np.array(pca_eigen_vectors)
projection.mean_ = np.array(pca_mean)
elif projection_name == "t-sne":
# FIXME: Trying to prevent an error on first load; race condition?
from openTSNE import TSNE # noqa

ascii_string = metadata[column_name]["other"]["pickled_projection"]
if not PROJECTION_EMBEDDING_CACHE.contains(ascii_string):
PROJECTION_EMBEDDING_CACHE.put(
ascii_string, pickle_loads_embedding_unsafe(ascii_string)
)
projection = PROJECTION_EMBEDDING_CACHE.get(ascii_string)

elif projection_name == "umap":
pass
else:
return

default_color = get_color(column_name)
projection_dimensions = metadata[column_name]["other"]["dimensions"]
projection_seed = metadata[column_name]["other"]["seed"]

traces = []
if asset_id:
Expand Down Expand Up @@ -2412,14 +2362,10 @@ def select_projection_data(
"Sampled Data",
cur,
[row[0] for row in rows],
projection_name,
projection,
traces,
3,
default_color,
"lightgray",
projection_dimensions,
projection_seed,
)
PROJECTION_TRACE_CACHE.put(key, traces)
# Traces contains projection data; make copy:
Expand All @@ -2428,10 +2374,7 @@ def select_projection_data(
# Next, add the selected asset:
asset_data_raw = select_asset(dgid, asset_id)
asset_data = json.loads(asset_data_raw)
vector_reduced = prepare_embedding(
asset_data["vector"], projection_dimensions, projection_seed
)
transformed = projection.transform(np.array([vector_reduced]))
transform = asset_data["projection_transform"]
if asset_data["color"]:
color = asset_data["color"]
else:
Expand All @@ -2443,8 +2386,8 @@ def select_projection_data(
text = asset_data.get("text", column_name)

trace = {
"x": [transformed[0][0]],
"y": [transformed[0][1]],
"x": [transform[0]],
"y": [transform[1]],
"text": text,
"name": text,
"type": "scatter",
Expand Down Expand Up @@ -2484,14 +2427,10 @@ def select_projection_data(
None,
cur,
values,
projection_name,
projection,
traces,
3,
default_color,
None,
projection_dimensions,
projection_seed,
)
PROJECTION_TRACE_CACHE.put(key, traces)
# Traces contains projection data; make copy:
Expand Down
1 change: 0 additions & 1 deletion backend/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@ def get_version(file, name="__version__"):
"scikit-learn",
"scipy",
"waitress",
"opentsne",
],
packages=[
"kangas",
Expand Down

0 comments on commit 1494b7b

Please sign in to comment.