
Commit 69ae6b7

Update webcrawler.py
Dawid-Okw authored May 11, 2024
1 parent 5de8902 · commit 69ae6b7
Showing 1 changed file with 5 additions and 5 deletions.
10 changes: 5 additions & 5 deletions apps/webcrawler/webcrawler.py
@@ -24,7 +24,7 @@
 
 def load_web():
     logger.info("load_web")
-    print(f"Loading of webpages started")
+    print("Loading of webpages started")
     # Initialize urls to parse
     urls = [
         "https://www.startmunich.de",
@@ -46,14 +46,14 @@ def load_web():
     # Process raw HTML docs to text docs
     html2text = Html2TextTransformer()
     docs = html2text.transform_documents(docs)
-    print(f"Loading of webpages finished")
+    print("Loading of webpages finished")
     logger.info("load_web finished")
     return docs
 
 
 def write_db():
     logger.info(f"start writing web_data")
-    print(f"Start writing web_data to neo4j")
+    print("Start writing web_data to neo4j")
 
     docs = load_web()
     # write each page into the neo4J database using the url as the id
@@ -78,15 +78,15 @@ def write_db():
             content=page.page_content
         )
         logger.info(f"page: " + page.metadata.get("source") + " added to neo4j")
-        print(f"page: " + page.metadata.get("source") + " added to neo4j")
+        print("page: " + page.metadata.get("source") + " added to neo4j")
 
     # send post request to '/enqueue_web' endpoint with web_ids
     # to add the webpage to the queue
     # cf. vectordb_sync/vectordb_sync.py
 
     requests.post("http://vectordb_sync:5000/enqueue_web", json={"ids": web_ids})
     logger.info(f"adding web_ids: to queue")
-    print(f"adding web_ids: to queue")
+    print("adding web_ids: to queue")
 
 
 if __name__ == '__main__':
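
A note on what the commit does: each edited line was an f-string containing no {} placeholders, so the f prefix had no effect at runtime; dropping it leaves an identical plain string literal (linters such as flake8 flag the old form as F541, "f-string is missing placeholders"). A minimal sketch of the distinction, in plain Python:

# No placeholders: the f prefix is dead weight, the output is identical either way.
print(f"Loading of webpages started")   # before this commit
print("Loading of webpages started")    # after this commit

# The prefix only matters once a value is actually interpolated:
url = "https://www.startmunich.de"
print(f"Loading {url}")                 # -> Loading https://www.startmunich.de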
