Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Abstract classes using msgspec #742

Open
arthurbrenno opened this issue Oct 9, 2024 · 0 comments
Open

Abstract classes using msgspec #742

arthurbrenno opened this issue Oct 9, 2024 · 0 comments

Comments

@arthurbrenno
Copy link

Question

How can I simulate abstract classes using msgspec? I know it's not the main intention of msgspec, but I really want to represent a hierarchy of classes using it. Example:

from __future__ import annotations
from cortex.base import CPrototype
from typing import List
from cortex.core.llms import AIModel


class CPrototype(msgspec.Struct):
    """
    Base schema for all models in the application.

    This class extends msgspec's Struct to provide additional functionality,
    specifically the ability to create model instances from JSON strings or
    dictionaries, with built-in error handling and JSON repair capabilities
    to handle malformed Large Language Model json string returns.

    Attributes:
        Inherits all attributes from msgspec.Struct.

    Example:
        >>> class Person(CPrototype):
        ...     name: str
        ...     age: int
        ...
        >>> json_data = '{"name": "Alice", "age": 30}'
        >>> person = Person.from_json(json_data)
        >>> print(person)
        Person(name='Alice', age=30)
    """

    def __post_init__(self) -> None:
        """
        Reject direct instantiation of the base class.

        Only the exact ``CPrototype`` class is refused; subclasses pass this
        check, so it does not by itself make intermediate subclasses abstract.

        Raises:
            TypeError: If the instance's class is ``CPrototype`` itself.
        """
        if self.__class__ is CPrototype:
            raise TypeError("CPrototype class cannot be instantiated directly")

    @classmethod
    def from_json(cls: Type[T], json_content: str | dict) -> T:
        """
        Convert a JSON string or dictionary to an instance of ``cls``.

        A dictionary is first encoded to a JSON string with msgspec. The text
        is then decoded as-is; if msgspec raises ``DecodeError`` the error is
        logged, the text is passed through ``json_repair.repair_json`` and
        decoding is attempted exactly once more.

        Args:
            json_content (Union[str, Dict[str, Any]]): The JSON string or dictionary to convert.
                Malformed strings (for example text wrapped in ```json ... ``` markers)
                are only recovered to the extent ``json_repair.repair_json`` can fix them.

        Returns:
            T: An instance of the class that called this method (a subclass of CPrototype).

        Raises:
            msgspec.DecodeError: If decoding still fails after the repair attempt
                (this includes msgspec's validation errors when the data does
                not match the model).

        Examples:
            Creating an instance from a well-formed JSON string:
            >>> class User(CPrototype):
            ...     username: str
            ...     email: str
            ...
            >>> json_str = '{"username": "john_doe", "email": "john_doe@example.com"}'
            >>> user = User.from_json(json_str)
            >>> print(user)
            User(username='john_doe', email='john_doe@example.com')

            Creating an instance from a dictionary:
            >>> data_dict = {"username": "jane_doe", "email": "jane_doe@example.com"}
            >>> user = User.from_json(data_dict)
            >>> print(user)
            User(username='jane_doe', email='jane_doe@example.com')

            Handling malformed JSON:
            >>> malformed_json = '{"username": "bob" "email": "bob@example.com"}'
            >>> user = User.from_json(malformed_json)
            >>> print(user)
            User(username='bob', email='bob@example.com')

            Handling JSON with code block markers:
            >>> json_with_markers = '''```json
            ... {"username": "alice", "email": "alice@example.com"}
            ... ```'''
            >>> user = User.from_json(json_with_markers)
            >>> print(user)
            User(username='alice', email='alice@example.com')
        """
        if isinstance(json_content, dict):
            # Normalise dict input to JSON text so both input kinds share the
            # same decode/repair path below.
            json_content = msgspec.json.encode(json_content).decode("utf-8")

        try:
            return msgspec.json.decode(json_content, type=cls)
        except msgspec.DecodeError as e:
            logger.exception(e)
            # Second and final attempt on the repaired text; a failure here
            # propagates to the caller.
            repaired = json_repair.repair_json(json_content)
            return msgspec.json.decode(repaired, type=cls)

    @classmethod
    def from_dict(cls: Type[T], dictionary: Dict[str, Any], /) -> T:
        """
        Build an instance of ``cls`` from a dictionary.

        Thin positional-only wrapper that forwards to ``from_json``.

        Args:
            dictionary: Mapping of field names to values.

        Returns:
            T: An instance of the class that called this method.
        """
        return cls.from_json(dictionary)

    def as_dict(self) -> Dict[str, Any]:
        """
        Convert the model instance to a plain dictionary.

        The instance is encoded to JSON with msgspec and decoded back into a
        ``dict``, so the result only contains values that survive a JSON
        round trip. NumPy floating/integer scalars and arrays are converted
        to ``float``, ``int`` and nested lists on the way.

        Returns:
            Dict[str, Any]: The dictionary representation of the model instance.

        Raises:
            TypeError: Expected from msgspec when a field holds a value that
                neither msgspec nor the NumPy hook can encode (msgspec
                documents ``NotImplementedError`` from a hook as the signal
                for an unsupported type) -- confirm against the msgspec
                version in use.

        Example:
            >>> class User(CPrototype):
            ...     username: str
            ...     email: str
            ...
            >>> user = User(username="alice", email="alice@example.com")
            >>> print(user.as_dict())
            {'username': 'alice', 'email': 'alice@example.com'}
        """

        def enc_hook(obj: Any) -> Any:
            # Encoding objects of type numpy.float64 (and other NumPy scalar
            # types) is unsupported by msgspec, so map them to builtins.
            if isinstance(obj, np.floating):
                return float(obj)
            if isinstance(obj, np.integer):
                return int(obj)
            if isinstance(obj, np.ndarray):
                return obj.tolist()
            # Returning ``obj`` unchanged would hand the same unsupported
            # value straight back to the encoder; msgspec's documented
            # contract for hooks is to raise NotImplementedError instead.
            raise NotImplementedError(
                f"Objects of type {type(obj)} are not supported"
            )

        encoded = msgspec.json.encode(self, enc_hook=enc_hook)
        as_mapping: Dict[str, Any] = msgspec.json.decode(encoded, type=dict)
        return as_mapping

    @classmethod
    def to_json_schema(cls: Type[T]) -> Dict[str, Any]:
        """
        Generate a JSON schema representation of the model.

        This is a direct call to ``msgspec.json.schema(cls)``; the dialect and
        exact layout of the result are whatever msgspec produces. NOTE(review):
        msgspec documents its output as JSON Schema 2020-12 / OpenAPI 3.1
        compatible, with struct types collected under ``$defs`` and referenced
        through ``$ref`` -- confirm against the msgspec version in use.

        Returns:
            Dict[str, Any]: The JSON schema as a dictionary.

        Raises:
            Any exception raised by ``msgspec.json.schema`` propagates
            unchanged (for example when a field type cannot be described).

        Example:
            >>> class Address(CPrototype):
            ...     street: str
            ...     city: str
            ...     zip_code: int
            ...
            >>> class User(CPrototype):
            ...     username: str
            ...     age: int
            ...     email: str
            ...     address: Address
            ...
            >>> schema = User.to_json_schema()
            >>> isinstance(schema, dict)
            True
        """
        schema: Dict[str, Any] = msgspec.json.schema(cls)
        return schema


class Tokenizer(CPrototype):
    # Intermediate base of the tokenizer hierarchy: subclasses are expected to
    # override ``tokenize``. (Kept as a comment rather than a class docstring
    # so the class's ``__doc__`` stays unset.)

    @classmethod
    def of(cls, model: AIModel) -> Tokenizer:
        """Factory placeholder taking an ``AIModel``; it has no body yet and returns ``None``."""

    def tokenize(self, text: str) -> List[str]:
        """Turn ``text`` into a list of tokens; the base class always raises ``NotImplementedError``."""
        raise NotImplementedError()


class VertexAITokenizer(Tokenizer):
    # Concrete member of the tokenizer hierarchy; overrides ``tokenize`` so it
    # no longer raises. (Comment instead of a class docstring so ``__doc__``
    # stays unset.)

    def tokenize(self, text: str) -> List[str]:
        """Return the tokens for ``text``; this stub ignores the input and returns an empty list."""
        no_tokens: List[str] = []
        return no_tokens

Note that I'm explicitly raising NotImplementedError.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant