Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Scientific Metadata Search Engine client service using TypeSense #697

Draft
wants to merge 15 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions example_configs/typesense.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
authentication:
# The default is false. Set to true to enable any HTTP client that can
# connect to _read_. An API key is still required to write.
allow_anonymous_access: false
single_user_api_key: "secret" # for dev
trees:
- path: /
tree: catalog
args:
uri: "sqlite+aiosqlite:///:memory:"
# or, uri: "sqlite+aiosqlite:////catalog.db"
# or, "postgresql+asyncpg://..."
writable_storage: "tmp/"
init_if_not_exists: true
typesense_client:
api_key: "secret"
nodes:
- host: "http://localhost"
port: 8108
protocol: "http"
# Each entry under `schemas` is one of:
#   - a predefined default schema from the Tiled system,
#   - the filename of another .yaml file that defines the schema, or
#   - a field defined inline here.
# Each field follows the Typesense collection schema field format (see the link below).
# `source` is the path within the metadata from which the value is extracted.
# https://typesense.org/docs/26.0/api/collections.html#schema-parameters
schemas:
- "example_configs/typesense_schemas/extensible_typesense_schema.yml"
- name: "name"
type: "string"
facet: false
source: "*name"
- name: "description"
type: "string"
source: "start.detectors.description"
- name: "location"
type: "string"
source: "start.detectors.location"
- name: "edible"
type: "bool"
facet: true
source: "start.sample.can_be_eaten"
- name: "number"
type: "int32"
facet: false
source: "start.sample.number"
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
schemas:
- name: "weight"
type: "string"
facet: true
source: "start.sample.weight"
- name: "sample_description"
type: "string"
source: "start.sample.description"
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ all = [
"xarray",
"zarr",
"zstandard",
"typesense",
]
# These are needed by the client and server to transmit/receive arrays.
array = [
Expand Down
82 changes: 81 additions & 1 deletion tiled/catalog/adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,14 @@
import shutil
import sys
import uuid
import yaml
from functools import partial, reduce
from pathlib import Path
from typing import Callable, Dict
from urllib.parse import quote_plus, urlparse

import anyio
import typesense
from fastapi import HTTPException
from sqlalchemy import (
delete,
Expand Down Expand Up @@ -145,9 +147,11 @@ def __init__(
writable_storage=None,
readable_storage=None,
adapters_by_mimetype=None,
typesense_client=None,
key_maker=lambda: str(uuid.uuid4()),
):
self.engine = engine
self.typesense_client = typesense_client
readable_storage = readable_storage or []
if not isinstance(readable_storage, list):
raise ValueError("readable_storage should be a list of URIs or paths")
Expand Down Expand Up @@ -313,6 +317,7 @@ def metadata(self):
return self.node.metadata_

async def startup(self):
print(dir(self.context.typesense_client))
if (self.context.engine.dialect.name == "sqlite") and (
self.context.engine.url.database == ":memory:"
):
Expand Down Expand Up @@ -1321,6 +1326,7 @@ def in_memory(
readable_storage=None,
echo=DEFAULT_ECHO,
adapters_by_mimetype=None,
typesense_client=None,
):
uri = "sqlite+aiosqlite:///:memory:"
return from_uri(
Expand All @@ -1332,6 +1338,7 @@ def in_memory(
readable_storage=readable_storage,
echo=echo,
adapters_by_mimetype=adapters_by_mimetype,
typesense_client=typesense_client,
)


Expand All @@ -1346,6 +1353,7 @@ def from_uri(
init_if_not_exists=False,
echo=DEFAULT_ECHO,
adapters_by_mimetype=None,
typesense_client=None,
):
uri = ensure_specified_sql_driver(uri)
if init_if_not_exists:
Expand Down Expand Up @@ -1385,8 +1393,20 @@ def from_uri(
)
if engine.dialect.name == "sqlite":
event.listens_for(engine.sync_engine, "connect")(_set_sqlite_pragma)
if typesense_client:
# Parse the extensible schema into a typesense client compatible format:
typesense_schema = build_ts_schema(typesense_client["schemas"])
typesense_client = {
"client": typesense.Client(
{
"api_key": typesense_client["api_key"],
"nodes": typesense_client["nodes"],
}
),
"schema": typesense_schema,
}
return CatalogContainerAdapter(
Context(engine, writable_storage, readable_storage, adapters_by_mimetype),
Context(engine, writable_storage, readable_storage, adapters_by_mimetype, typesense_client),
RootNode(metadata, specs, access_policy),
access_policy=access_policy,
)
Expand Down Expand Up @@ -1467,6 +1487,66 @@ def specs_array_to_json(specs):
return [{"name": spec} for spec in specs]


def build_ts_schema(ts_schema):
    """Build a flat Typesense schema from inline fields and referenced YAML files.

    Entries are processed in order. A string entry is treated as the path of a
    YAML file whose top-level ``schemas`` list is expanded in place (depth
    first), so files may reference further files. Each file is expanded at
    most once, which also protects against reference cycles.

    Parameters
    ----------
    ts_schema : list
        Raw schema entries. Each entry is one of:

        * a ``dict`` describing a field, which is kept as-is;
        * a ``str`` path to a YAML file containing a ``schemas`` list;
        * a callable, which is called with no arguments and whose result is
          kept only if it is a ``dict``.

        Entries of any other type are ignored.

    Returns
    -------
    schema_objects : list
        A list of schema field dicts that can be used to create a Typesense
        collection or locate properties.

    Notes
    -----
    Missing files and files containing invalid YAML are reported with
    ``print`` and skipped. Files that are empty, that lack a ``schemas`` key,
    or whose ``schemas`` value is not a list are skipped silently.

    Examples
    --------
    >>> build_ts_schema([{"name":"thing", "type":"string", "source":"start.thing"},"additional_schema.yml"])
    [
    {"name":"thing", "type":"string", "source":"start.thing"},
    {"name":"additional_thing-from-file", "type":"string", "source":"start.additional_thing"}
    ]
    """
    schema_objects = []
    opened_files = set()  # Paths already expanded; prevents repeats and cycles.
    # An explicit stack of iterators gives depth-first expansion of nested
    # files without recursion.
    stack = [iter(ts_schema)]
    # Sentinel for "iterator exhausted". Using next() with a default instead
    # of a broad ``except StopIteration`` means a StopIteration raised by a
    # callable entry is not mistaken for the end of the current list.
    exhausted = object()
    while stack:
        item = next(stack[-1], exhausted)
        if item is exhausted:
            stack.pop()
            continue
        if isinstance(item, str):
            if item in opened_files:
                continue  # Skip if file already opened
            try:
                with open(item, "r", encoding="utf-8") as file:
                    # Recorded once the file opens, even if parsing then fails,
                    # so a broken file is reported only once.
                    opened_files.add(item)
                    loaded = yaml.safe_load(file)
            except FileNotFoundError:
                print(f"File {item} not found")
                continue
            except yaml.YAMLError:
                # PyYAML signals malformed documents with YAMLError
                # (not the built-in SyntaxError).
                print(f"Syntax error in file {item}")
                continue
            # safe_load returns None for an empty file and may return any
            # YAML type; only a mapping with a list under "schemas" is usable.
            nested = loaded.get("schemas") if isinstance(loaded, dict) else None
            if isinstance(nested, list):
                stack.append(iter(nested))
        elif isinstance(item, dict):
            schema_objects.append(item)
        elif callable(item):
            result = item()
            if isinstance(result, dict):
                schema_objects.append(result)
    return schema_objects


STRUCTURES = {
StructureFamily.array: CatalogArrayAdapter,
StructureFamily.awkward: CatalogAwkwardAdapter,
Expand Down
Loading