diff --git a/dialog_lib/loaders/csv.py b/dialog_lib/loaders/csv.py index f3ffe96..d441081 100644 --- a/dialog_lib/loaders/csv.py +++ b/dialog_lib/loaders/csv.py @@ -1,3 +1,4 @@ +import logging from dialog_lib.db import get_session from dialog_lib.db.models import CompanyContent from dialog_lib.embeddings.generate import generate_embedding @@ -6,6 +7,9 @@ from langchain_community.document_loaders.csv_loader import CSVLoader +logger = logging.getLogger(__name__) + + def load_csv( file_path, dbsession=get_session, embeddings_model_instance=None, embedding_llm_model=None, embedding_llm_api_key=None, company_id=None @@ -28,6 +32,10 @@ def load_csv( values = line.split(": ") content[values[0]] = values[1] + + if not dbsession.query(CompanyContent).filter( + CompanyContent.question == content["question"], CompanyContent.content == content["content"] + ).first(): company_content = CompanyContent( category="csv", subcategory="csv-content", @@ -37,5 +45,5 @@ def load_csv( embedding=generate_embedding(csv_content.page_content, embeddings_model_instance) ) session.add(company_content) - - return company_content \ No newline at end of file + else: + logger.warning(f"Question: {content['question']} already exists in the database. Skipping.") \ No newline at end of file diff --git a/dialog_lib/loaders/gsheets.py b/dialog_lib/loaders/gsheets.py index 6346ec4..c7e17c1 100644 --- a/dialog_lib/loaders/gsheets.py +++ b/dialog_lib/loaders/gsheets.py @@ -1,4 +1,5 @@ import gspread +import logging from dialog_lib.db.models import CompanyContent from dialog_lib.embeddings.generate import generate_embedding @@ -11,6 +12,8 @@ from typing import Any, Dict, Iterator, List, Optional, Sequence, Union +logger = logging.getLogger(__name__) + class GoogleSheetsLoader(BaseLoader): def __init__(self, credentials_path: Union[str, Path], spreadsheet_url: str, sheet_name: str): self.sheet_name = sheet_name @@ -59,15 +62,19 @@ def load_google_sheets( values = line.split(": ") content[values[0]] = values[1] - company_content = CompanyContent( - category="csv", - subcategory="csv-content", - question=content["question"], - content=content["content"], - dataset=company_id, - embedding=generate_embedding(csv_content.page_content, embeddings_model_instance) - ) - dbsession.add(company_content) + if not dbsession.query(CompanyContent).filter( + CompanyContent.question == content["question"], CompanyContent.content == content["content"] + ).first(): + company_content = CompanyContent( + category="csv", + subcategory="csv-content", + question=content["question"], + content=content["content"], + dataset=company_id, + embedding=generate_embedding(csv_content.page_content, embeddings_model_instance) + ) + dbsession.add(company_content) + else: + logger.warning(f"Question: {content['question']} already exists in the database. Skipping.") - dbsession.commit() - return company_content \ No newline at end of file + dbsession.commit() \ No newline at end of file