diff --git a/.env.sample b/.env.sample index ad51840680..2005982a66 100644 --- a/.env.sample +++ b/.env.sample @@ -118,3 +118,19 @@ PROMPTFLOW_RESPONSE_TIMEOUT=120 PROMPTFLOW_REQUEST_FIELD_NAME=query PROMPTFLOW_RESPONSE_FIELD_NAME=reply PROMPTFLOW_CITATIONS_FIELD_NAME=documents +# Chat with data: MongoDB database +MONGODB_ENDPOINT= +MONGODB_USERNAME= +MONGODB_PASSWORD= +MONGODB_DATABASE_NAME= +MONGODB_COLLECTION_NAME= +MONGODB_APP_NAME= +MONGODB_INDEX_NAME= +MONGODB_TOP_K= +MONGODB_STRICTNESS= +MONGODB_ENABLE_IN_DOMAIN= +MONGODB_CONTENT_COLUMNS= +MONGODB_FILENAME_COLUMN= +MONGODB_TITLE_COLUMN= +MONGODB_URL_COLUMN= +MONGODB_VECTOR_COLUMNS= diff --git a/README.md b/README.md index 0f0f9b8501..b6f8401bc8 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,7 @@ This repo contains sample code for a simple chat webapp that integrates with Azu - Elasticsearch index (preview) - Pinecone index (private preview) - Azure SQL Server (private preview) + - Mongo DB (preview) ## Configure the app @@ -283,6 +284,34 @@ Note: RBAC assignments can take a few minutes before becoming effective. - `AZURE_OPENAI_EMBEDDING_NAME`: the name of your Ada (text-embedding-ada-002) model deployment on your Azure OpenAI resource. - `PINECONE_VECTOR_COLUMNS`: the vector columns in your index to use when searching. Join them with `|` like `contentVector|titleVector`. +#### Chat with your data using Mongo DB (Private Preview) + +1. Update the `AZURE_OPENAI_*` environment variables as described in the [basic chat experience](#basic-chat-experience) above. + +2. To connect to your data, you need to specify a Mongo DB database configuration. Learn more about [MongoDB](https://www.mongodb.com/). + +3. Configure data source settings as described in the table below. + + | App Setting | Required? 
| Default Value | Note | + | --- | --- | --- | ------------- | + |DATASOURCE_TYPE|Yes||Must be set to `MongoDB`| + |MONGODB_ENDPOINT|Yes||The endpoint used to connect to your Mongo DB instance. `MONGODB_USERNAME`, `MONGODB_PASSWORD` and `MONGODB_APP_NAME` must also be set.| + |MONGODB_INDEX_NAME|Yes||The name of your Mongo DB vector index| + |MONGODB_DATABASE_NAME|Yes||The name of your Mongo DB database| + |MONGODB_COLLECTION_NAME|Yes||The name of your Mongo DB collection| + |MONGODB_TOP_K|No|5|The number of documents to retrieve when querying your search index.| + |MONGODB_ENABLE_IN_DOMAIN|No|True|Limits responses to only queries relating to your data.| + |MONGODB_STRICTNESS|No|3|Integer from 1 to 5 specifying the strictness for the model limiting responses to your data.| + |MONGODB_CONTENT_COLUMNS|No||List of fields in your search index that contain the text content of your documents to use when formulating a bot response. Represent these as a string joined with "|", e.g. `"product_description|product_manual"`| + |MONGODB_FILENAME_COLUMN|No|| Field from your search index that gives a unique identifier of the source of your data to display in the UI.| + |MONGODB_TITLE_COLUMN|No||Field from your search index that gives a relevant title or header for your data content to display in the UI.| + |MONGODB_URL_COLUMN|No||Field from your search index that contains a URL for the document, e.g. an Azure Blob Storage URI. This value is not currently used.| + |MONGODB_VECTOR_COLUMNS|No||List of fields in your search index that contain vector embeddings of your documents to use when formulating a bot response. Represent these as a string joined with "|", e.g. `"contentVector|titleVector"`| + + MongoDB uses vector search by default, so ensure these settings are configured on your app: + - `AZURE_OPENAI_EMBEDDING_NAME`: the name of your Ada (text-embedding-ada-002) model deployment on your Azure OpenAI resource. + - `MONGODB_VECTOR_COLUMNS`: the vector columns in your index to use when searching. 
Join them with `|` like `contentVector|titleVector`. + #### Chat with your data using Azure SQL Server (Private Preview) 1. Update the `AZURE_OPENAI_*` environment variables as described in the [basic chat experience](#basic-chat-experience) above. diff --git a/backend/settings.py b/backend/settings.py index dda4492cc5..9438179888 100644 --- a/backend/settings.py +++ b/backend/settings.py @@ -625,7 +625,8 @@ class _AzureSqlServerSettings(BaseSettings, DatasourcePayloadConstructor): model_config = SettingsConfigDict( env_prefix="AZURE_SQL_SERVER_", env_file=DOTENV_PATH, - extra="ignore" + extra="ignore", + env_ignore_empty=True ) _type: Literal["azure_sql_server"] = PrivateAttr(default="azure_sql_server") @@ -658,7 +659,84 @@ def construct_payload_configuration( "parameters": parameters } + +class _MongoDbSettings(BaseSettings, DatasourcePayloadConstructor): + model_config = SettingsConfigDict( + env_prefix="MONGODB_", + env_file=DOTENV_PATH, + extra="ignore", + env_ignore_empty=True + ) + _type: Literal["mongo_db"] = PrivateAttr(default="mongo_db") + + endpoint: str + username: str = Field(exclude=True) + password: str = Field(exclude=True) + database_name: str + collection_name: str + app_name: str + index_name: str + query_type: Literal["vector"] = "vector" + top_k: int = Field(default=5, serialization_alias="top_n_documents") + strictness: int = 3 + enable_in_domain: bool = Field(default=True, serialization_alias="in_scope") + content_columns: Optional[List[str]] = Field(default=None, exclude=True) + vector_columns: Optional[List[str]] = Field(default=None, exclude=True) + title_column: Optional[str] = Field(default=None, exclude=True) + url_column: Optional[str] = Field(default=None, exclude=True) + filename_column: Optional[str] = Field(default=None, exclude=True) + + + # Constructed fields + authentication: Optional[dict] = None + embedding_dependency: Optional[dict] = None + fields_mapping: Optional[dict] = None + + @field_validator('content_columns', 
'vector_columns', mode="before") + @classmethod + def split_columns(cls, comma_separated_string: str) -> List[str]: + if isinstance(comma_separated_string, str) and len(comma_separated_string) > 0: + return parse_multi_columns(comma_separated_string) + + return None + + @model_validator(mode="after") + def set_fields_mapping(self) -> Self: + self.fields_mapping = { + "content_fields": self.content_columns, + "title_field": self.title_column, + "url_field": self.url_column, + "filepath_field": self.filename_column, + "vector_fields": self.vector_columns + } + return self + + @model_validator(mode="after") + def construct_authentication(self) -> Self: + self.authentication = { + "type": "username_and_password", + "username": self.username, + "password": self.password + } + return self + def construct_payload_configuration( + self, + *args, + **kwargs + ): + self.embedding_dependency = \ + self._settings.azure_openai.extract_embedding_dependency() + + parameters = self.model_dump(exclude_none=True, by_alias=True) + parameters.update(self._settings.search.model_dump(exclude_none=True, by_alias=True)) + + return { + "type": self._type, + "parameters": parameters + } + + class _BaseSettings(BaseSettings): model_config = SettingsConfigDict( env_file=DOTENV_PATH, @@ -729,6 +807,10 @@ def set_datasource_settings(self) -> Self: elif self.base_settings.datasource_type == "AzureSqlServer": self.datasource = _AzureSqlServerSettings(settings=self, _env_file=DOTENV_PATH) logging.debug("Using SQL Server") + + elif self.base_settings.datasource_type == "MongoDB": + self.datasource = _MongoDbSettings(settings=self, _env_file=DOTENV_PATH) + logging.debug("Using Mongo DB") else: self.datasource = None @@ -736,8 +818,9 @@ def set_datasource_settings(self) -> Self: return self - except ValidationError: + except ValidationError as e: logging.warning("No datasource configuration found in the environment -- calls will be made to Azure OpenAI without grounding data.") + 
logging.warning(e.errors()) app_settings = _AppSettings()