Skip to content

Commit

Permalink
- changed the retrieval method a little.
Browse files Browse the repository at this point in the history
- Modified the Nutrition Facts scraper to convert any list-form transcript into a string
  • Loading branch information
manikg08 committed Jun 30, 2024
1 parent 0251d5d commit 477abff
Show file tree
Hide file tree
Showing 13 changed files with 509 additions and 3 deletions.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Empty file.
Binary file added chroma_db/chroma.sqlite3
Binary file not shown.
4 changes: 3 additions & 1 deletion src/backend/RAG/LangChain_Implementation/basic_retrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def _extract_from_dict(self, doc_dict, chunks_with_metadata):
chroma_db.add_documents(documents=document_chunks)

# ridiculous linting rules - thanks:
query_text1 = 'what did the executive at the Kellogs ad firm say? '
query_text1 = 'Can Blueberries Help with Diabetes and Repairing DNA?'
query_text2 = 'What can we conclude from this response?'
query_text = query_text1 + query_text2

Expand All @@ -104,6 +104,8 @@ def _extract_from_dict(self, doc_dict, chunks_with_metadata):
metadata_str = ', '.join([f'{key}: {value}' for key, value in metadata.items()])
relevant_info += f'Content: {doc_content}\nMetadata: {metadata_str}\n\n'

print('Relevant information is:', relevant_info)

modified_prompt = {'text': f'{query_text}\n\nHere is some relevant information:\n{relevant_info}'}

prompt_text = modified_prompt['text']
Expand Down
500 changes: 500 additions & 0 deletions src/backend/RAG/LangChain_Implementation/blog_data.json

Large diffs are not rendered by default.

Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
8 changes: 6 additions & 2 deletions src/backend/Scrapers/Nutritionfacts/nutrition.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ def get_url_content(self, driver, blog_url):
image_elements = content_element.find_all('img')
image_urls = [img['src'] for img in image_elements if 'src' in img.attrs]

return title, date, author, content_chunks, key_take_away_chunks, image_urls, blog_url
return (title, date, author, content_chunks, key_take_away_chunks, image_urls, blog_url)
except Exception:
print(f'Error getting content from url: {blog_url}')
error_msg = f'Error getting content from url: {blog_url}'
Expand All @@ -226,6 +226,10 @@ def get_url_content(self, driver, blog_url):

def get_documents(self, data: TypeNutritionScrappingData) -> List[Document]:
transcript = data.get('transcript', '')

if isinstance(transcript, list):
transcript = ' '.join(transcript)

chunks = get_text_chunks(transcript)
metadata = {
'author': data.get('author', ''),
Expand All @@ -249,7 +253,7 @@ def _scrape(self) -> str:
if nutrition is None:
raise ValueError('Data does not exist for id: ' + str(self.element_id))

title, date, author, content_chunks, key_take_away_chunks, image_urls, blog_url = (
(title, date, author, content_chunks, key_take_away_chunks, image_urls, blog_url) = (
nutrition
)
info: TypeNutritionScrappingData = {
Expand Down

0 comments on commit 477abff

Please sign in to comment.