Postgres: pgvector implementation #1926

Open · wants to merge 9 commits into base: main

Changes from 3 commits
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -32,6 +32,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Rename `_Predictor` to `Model`
- Allow developers to write `Listeners` and `Graph` in a single formalism
- Change unittesting framework to pure configuration (no patching configs)
- Add `PostgresDataBackend` for `pgvector` integration

#### Bug Fixes
- Fixed a bug in refresh_after_insert for listeners with select None
5 changes: 5 additions & 0 deletions superduperdb/backends/base/backends.py
@@ -2,6 +2,7 @@
from pymongo import MongoClient

from superduperdb.backends.ibis.data_backend import IbisDataBackend
from superduperdb.backends.postgres.data_backend import PostgresDataBackend
from superduperdb.backends.local.artifacts import FileSystemArtifactStore
from superduperdb.backends.mongodb.artifacts import MongoArtifactStore
from superduperdb.backends.mongodb.data_backend import MongoDataBackend
@@ -10,10 +11,13 @@
from superduperdb.vector_search.atlas import MongoAtlasVectorSearcher
from superduperdb.vector_search.in_memory import InMemoryVectorSearcher
from superduperdb.vector_search.lance import LanceVectorSearcher
from superduperdb.vector_search.postgres import PostgresVectorSearcher


data_backends = {
'mongodb': MongoDataBackend,
'ibis': IbisDataBackend,
'postgres': PostgresDataBackend,
}

artifact_stores = {
@@ -30,6 +34,7 @@
'lance': LanceVectorSearcher,
'in_memory': InMemoryVectorSearcher,
'mongodb+srv': MongoAtlasVectorSearcher,
'postgres': PostgresVectorSearcher,
}

CONNECTIONS = {
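With the two mapping entries above, a Postgres connection string should resolve to `PostgresDataBackend` for data and `PostgresVectorSearcher` for vector search. A minimal sketch of the intended usage, with a placeholder connection string that is not part of this diff:

from superduperdb import superduper

# The 'postgres' entry in `data_backends` is selected when the URI scheme is
# postgres:// or postgresql:// (see the build.py and superduper.py changes below).
db = superduper('postgresql://user:pass@localhost:5432/test_db')  # placeholder URI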
106 changes: 106 additions & 0 deletions superduperdb/backends/postgres/data_backend.py
@@ -0,0 +1,106 @@
from superduperdb.backends.ibis.data_backend import IbisDataBackend


import typing as t
from warnings import warn

import ibis
import pandas
from ibis.backends.base import BaseBackend

from superduperdb.backends.ibis.db_helper import get_db_helper
from superduperdb.backends.ibis.field_types import FieldType, dtype
from superduperdb.backends.ibis.query import Table
from superduperdb.backends.local.artifacts import FileSystemArtifactStore
from superduperdb.backends.sqlalchemy.metadata import SQLAlchemyMetadata
from superduperdb.components.datatype import DataType
from superduperdb.components.schema import Schema

BASE64_PREFIX = 'base64:'
INPUT_KEY = '_input_id'




class PostgresDataBackend(IbisDataBackend):
def __init__(self, conn: BaseBackend, name: str, in_memory: bool = False):
super().__init__(conn=conn, name=name)
self.in_memory = in_memory
self.dialect = getattr(conn, 'name', 'base')
self.db_helper = get_db_helper(self.dialect)

def url(self):
return self.conn.con.url + self.name

def build_artifact_store(self):
return FileSystemArtifactStore(conn='.superduperdb/artifacts/', name='ibis')

def build_metadata(self):
return SQLAlchemyMetadata(conn=self.conn.con, name='ibis')

def create_ibis_table(self, identifier: str, schema: Schema):
self.conn.create_table(identifier, schema=schema)

def insert(self, table_name, raw_documents):
for doc in raw_documents:
for k, v in doc.items():
doc[k] = self.db_helper.convert_data_format(v)
table_name, raw_documents = self.db_helper.process_before_insert(
table_name, raw_documents
)
if not self.in_memory:
self.conn.insert(table_name, raw_documents)
else:
self.conn.create_table(table_name, pandas.DataFrame(raw_documents))

def create_output_dest(
self, predict_id: str, datatype: t.Union[FieldType, DataType]
):
msg = (
"Model must have an encoder to create with the"
f" {type(self).__name__} backend."
)
assert datatype is not None, msg
if isinstance(datatype, FieldType):
output_type = dtype(datatype.identifier)
else:
output_type = datatype
fields = {
INPUT_KEY: dtype('string'),
'output': output_type,
}
return Table(
identifier=f'_outputs.{predict_id}',
schema=Schema(identifier=f'_schema/{predict_id}', fields=fields),
)

def create_table_and_schema(self, identifier: str, mapping: dict):
"""
Create a table with the given schema in the data-backend.
"""

try:
mapping = self.db_helper.process_schema_types(mapping)
t = self.conn.create_table(identifier, schema=ibis.schema(mapping))
except Exception as e:
if 'exists' in str(e):
warn("Table already exists, skipping...")
t = self.conn.table(identifier)
else:
raise e
return t

def drop(self, force: bool = False):
raise NotImplementedError(
"Dropping tables needs to be done in each DB natively"
)

def get_table_or_collection(self, identifier):
return self.conn.table(identifier)

def disconnect(self):
"""
Disconnect the client
"""

# TODO: implement me
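For reference, a minimal sketch of constructing this backend directly on top of an ibis connection; the URI and table schema below are illustrative and not part of this PR:

import ibis

from superduperdb.backends.postgres.data_backend import PostgresDataBackend

conn = ibis.connect('postgresql://user:pass@localhost:5432/test_db')  # placeholder URI
backend = PostgresDataBackend(conn=conn, name='test_db')

# create_table_and_schema creates the table, or warns and reuses it if it already exists.
t = backend.create_table_and_schema('documents', {'id': 'string', 'x': 'int64'})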
12 changes: 12 additions & 0 deletions superduperdb/base/build.py
@@ -117,6 +117,18 @@ def _build_databackend_impl(uri, mapping, type: str = 'data_backend'):
name = uri.split('/')[-1]
conn = mongomock.MongoClient()
return mapping['mongodb'](conn, name)

elif uri.startswith('postgres://') or uri.startswith('postgresql://'):
name = uri.split('/')[-1]
if type == 'data_backend':
ibis_conn = ibis.connect(uri)
return mapping['postgres'](ibis_conn, name)
else:
assert type == 'metadata'
from sqlalchemy import create_engine

sql_conn = create_engine(uri)
return mapping['sqlalchemy'](sql_conn, name)

elif uri.endswith('.csv'):
if type == 'metadata':
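The metadata branch hands the same URI to SQLAlchemy rather than ibis. A hedged sketch of that path, with placeholder connection details:

from sqlalchemy import create_engine

from superduperdb.backends.sqlalchemy.metadata import SQLAlchemyMetadata

engine = create_engine('postgresql://user:pass@localhost:5432/test_db')  # placeholder URI
metadata = SQLAlchemyMetadata(conn=engine, name='test_db')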
3 changes: 3 additions & 0 deletions superduperdb/base/superduper.py
@@ -35,6 +35,9 @@ def _auto_identify_connection_string(item: str, **kwargs) -> t.Any:
elif item.startswith('mongodb+srv://') and 'mongodb.net' in item:
kwargs['data_backend'] = item

elif item.startswith('postgres://') or item.startswith('postgresql://'):
kwargs['data_backend'] = item
Comment on lines +38 to +39

Collaborator: Please remove this, the same reason

Author: will remove this


elif item.endswith('.csv'):
kwargs['data_backend'] = item

151 changes: 151 additions & 0 deletions superduperdb/vector_search/postgres.py
@@ -0,0 +1,151 @@
import json
import typing as t
import numpy
from pgvector.psycopg import psycopg, register_vector


from superduperdb.vector_search.base import BaseVectorSearcher, VectorItem, VectorIndexMeasureType


class PostgresVectorSearcher(BaseVectorSearcher):
"""
Implementation of a vector index using the ``pgvector`` library.
:param identifier: Unique string identifier of index
:param dimensions: Dimension of the vector embeddings stored in the table
:param conninfo: connection string to postgres
:param h: Optional array of vectors to insert at construction time
:param index: list of IDs
:param measure: measure to assess similarity
"""

def __init__(
self,
identifier: str,
dimensions: int,
conninfo: str,
h: t.Optional[numpy.ndarray] = None,
index: t.Optional[t.List[str]] = None,
measure: t.Optional[str] = None,
):
self.connection = psycopg.connect(conninfo=conninfo)
self.dimensions = dimensions
self.identifier = identifier
self.measure = VectorIndexMeasureType(measure) if measure else VectorIndexMeasureType.cosine
self.measure_query = self.get_measure_query()

with self.connection.cursor() as cursor:
cursor.execute('CREATE EXTENSION IF NOT EXISTS vector')
cursor.execute(
'CREATE TABLE IF NOT EXISTS %s (id varchar, embedding vector(%d))'
% (self.identifier, self.dimensions)
)
register_vector(self.connection)
if h is not None:
self._create_or_append_to_dataset(h, index)


def __len__(self):
with self.connection.cursor() as curr:
length = curr.execute(
'SELECT COUNT(*) FROM %s' % self.identifier
).fetchone()[0]
return length

def get_measure_query(self):
if self.measure.value == "l2":
return "embedding <-> '%s'"
elif self.measure.value == "dot":
return "(embedding <#> '%s') * -1"
elif self.measure.value == "cosine":
return "1 - (embedding <=> '%s')"
else:
raise NotImplementedError("Unrecognized measure format")


def _create_or_append_to_dataset(self, vectors, ids):
with self.connection.cursor().copy(
'COPY %s (id, embedding) FROM STDIN WITH (FORMAT BINARY)' % self.identifier
) as copy:
copy.set_types(['varchar', 'vector'])
for id_vector, vector in zip(ids, vectors):
copy.write_row([id_vector, vector])
self.connection.commit()


def add(self, items: t.Sequence[VectorItem]) -> None:
"""
Add items to the index.
:param items: t.Sequence of VectorItems
"""
ids = [item.id for item in items]
vectors = [item.vector for item in items]
self._create_or_append_to_dataset(vectors, ids)


def delete(self, ids: t.Sequence[str]) -> None:
"""
Remove items from the index
:param ids: t.Sequence of ids of vectors.
"""
with self.connection.cursor() as curr:
for id_vector in ids:
curr.execute(
"DELETE FROM %s WHERE id = '%s'" % (self.identifier, id_vector)
)
self.connection.commit()


def find_nearest_from_id(
self,
_id,
n: int = 100,
within_ids: t.Sequence[str] = (),
) -> t.Tuple[t.List[str], t.List[float]]:
"""
Find the nearest vectors to the vector with the given id.
:param _id: id of the vector
:param n: number of nearest vectors to return
"""
with self.connection.cursor() as curr:
curr.execute(
"""
SELECT embedding
FROM %s
WHERE id = '%s'"""
% (self.identifier, _id)
)
h = curr.fetchone()[0]
return self.find_nearest_from_array(h, n, within_ids)

def find_nearest_from_array(
self,
h: numpy.typing.ArrayLike,
n: int = 100,
within_ids: t.Sequence[str] = (),
) -> t.Tuple[t.List[str], t.List[float]]:
"""
Find the nearest vectors to the given vector.
:param h: vector
:param n: number of nearest vectors to return
"""
h = self.to_numpy(h)
if len(within_ids) == 0:
condition = "1=1"
else:
within_ids_str = ', '.join([f"'{i}'" for i in within_ids])
condition = f"id in ({within_ids_str})"
# For l2 the measure is a distance (smaller is better); for dot and cosine the
# measure query returns a similarity (larger is better), so the sort direction
# has to follow the measure.
order = 'ASC' if self.measure.value == 'l2' else 'DESC'
query_search_nearest = f"""
SELECT id, {self.measure_query} as distance
FROM %s
WHERE %s
ORDER BY distance {order}
LIMIT %d
"""
with self.connection.cursor() as curr:
curr.execute(
query_search_nearest % (json.dumps(h.tolist()), self.identifier, condition, n)
)
nearest_items = curr.fetchall()
ids = [row[0] for row in nearest_items]
scores = [row[1] for row in nearest_items]
return ids, scores
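A minimal usage sketch of the searcher, assuming a running Postgres instance with the pgvector extension installed; the connection string and vectors are placeholders:

import numpy as np

from superduperdb.vector_search.base import VectorItem
from superduperdb.vector_search.postgres import PostgresVectorSearcher

searcher = PostgresVectorSearcher(
    identifier='my_index',
    dimensions=3,
    conninfo='postgresql://user:pass@localhost:5432/test_db',  # placeholder URI
)

# VectorItem is assumed to carry an `id` and a `vector`, as used in `add` above.
searcher.add([
    VectorItem(id='a', vector=np.array([0.1, 0.2, 0.3])),
    VectorItem(id='b', vector=np.array([0.9, 0.8, 0.7])),
])

ids, scores = searcher.find_nearest_from_array(np.array([0.1, 0.2, 0.25]), n=1)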